Merge changes I10d66439,I6bd62e7b,Iec85e0df,If16c5d11,If803398e, ...

* changes:
  Update SPIRV-Tools to 9559cdbd
  Squashed 'third_party/SPIRV-Tools/' changes from d14db341b..9559cdbdf
  Update SPIRV-Headers to e4322e3b
  Squashed 'third_party/SPIRV-Headers/' changes from 79b6681aa..e4322e3be
  Temporarily disable warnings-as-errors
  Kokoro (Windows): Switch to python3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c9c0902..bf8a56b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -396,9 +396,13 @@
         set_cpp_flag("-fPIC")
     endif()
 
-    if(LINUX)
+    if(WIN32)
+        set_cpp_flag("-DVK_USE_PLATFORM_WIN32_KHR")
+    elseif(LINUX)
         set_cpp_flag("-DUSE_X11=1")
         set_cpp_flag("-DVK_USE_PLATFORM_XLIB_KHR")
+    elseif(APPLE)
+        set_cpp_flag("-DVK_USE_PLATFORM_MACOS_MVK")
     endif()
 
     # Use -g3 to have even more debug info
@@ -1533,7 +1537,6 @@
         ${SOURCE_DIR}/Reactor/Reactor.cpp
         ${SOURCE_DIR}/Reactor/Reactor.hpp
         ${SOURCE_DIR}/Reactor/SubzeroReactor.cpp
-        ${SOURCE_DIR}/Reactor/Routine.cpp
         ${SOURCE_DIR}/Reactor/Optimizer.cpp
         ${SOURCE_DIR}/Reactor/Nucleus.hpp
         ${SOURCE_DIR}/Reactor/Routine.hpp
@@ -1668,7 +1671,6 @@
     ${SOURCE_DIR}/Reactor/LLVMReactorDebugInfo.cpp
     ${SOURCE_DIR}/Reactor/LLVMReactorDebugInfo.hpp
     ${SOURCE_DIR}/Reactor/Nucleus.hpp
-    ${SOURCE_DIR}/Reactor/Routine.cpp
     ${SOURCE_DIR}/Reactor/Routine.hpp
     ${SOURCE_DIR}/Reactor/CPUID.cpp
     ${SOURCE_DIR}/Reactor/CPUID.hpp
@@ -1814,6 +1816,11 @@
     list(APPEND OPENGL_COMPILER_LIST
         ${OPENGL_COMPILER_DIR}/ossource_posix.cpp
     )
+
+    list(APPEND VULKAN_LIST
+        ${SOURCE_DIR}/WSI/MacOSSurfaceMVK.mm
+        ${SOURCE_DIR}/WSI/MacOSSurfaceMVK.h
+    )
 elseif(ANDROID)
     list(APPEND SWIFTSHADER_LIST
         ${SOURCE_DIR}/Main/FrameBufferAndroid.cpp
diff --git a/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj b/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj
index 956f1fa..458f73c 100644
--- a/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj
+++ b/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj
@@ -127,7 +127,6 @@
     <ClCompile Include="$(SolutionDir)src\Reactor\LLVMReactorDebugInfo.cpp" />

     <ClInclude Include="$(SolutionDir)src\Reactor\LLVMReactorDebugInfo.hpp" />

     <ClInclude Include="$(SolutionDir)src\Reactor\Nucleus.hpp" />

-    <ClCompile Include="$(SolutionDir)src\Reactor\Routine.cpp" />

     <ClInclude Include="$(SolutionDir)src\Reactor\Routine.hpp" />

     <ClCompile Include="$(SolutionDir)src\Reactor\CPUID.cpp" />

     <ClInclude Include="$(SolutionDir)src\Reactor\CPUID.hpp" />

diff --git a/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj.filters b/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj.filters
index df9efd5..e31e252 100644
--- a/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj.filters
+++ b/build/Visual Studio 15 2017 Win64/ReactorLLVM.vcxproj.filters
@@ -10,9 +10,6 @@
     <ClCompile Include="$(SolutionDir)src\Reactor\LLVMReactorDebugInfo.cpp">

       <Filter>src\Reactor</Filter>

     </ClCompile>

-    <ClCompile Include="$(SolutionDir)src\Reactor\Routine.cpp">

-      <Filter>src\Reactor</Filter>

-    </ClCompile>

     <ClCompile Include="$(SolutionDir)src\Reactor\CPUID.cpp">

       <Filter>src\Reactor</Filter>

     </ClCompile>

diff --git a/docs/Reactor.md b/docs/Reactor.md
index 7085cc8..df016d7 100644
--- a/docs/Reactor.md
+++ b/docs/Reactor.md
@@ -61,7 +61,7 @@
 The Routine is obtained and materialized by "calling" the ```Function<>``` object to give it a name:

 

 ```C++

-Routine *routine = function("one");

+auto routine = function("one");

 ```

 

 Finally, we can obtain the function pointer to the entry point of the routine, and call it:

@@ -84,9 +84,9 @@
 {

     Int x = function.Arg<0>();

     Int y = function.Arg<1>();

-   

+

     Int sum = x + y;

-   

+

     Return(sum);

 }

 ```

@@ -119,9 +119,9 @@
 Function<Int(Float)> function;

 {

     Float x = function.Arg<0>();

-   

+

     Int cast = Int(x);

-   

+

     Return(cast);

 }

 ```

@@ -132,9 +132,9 @@
 Function<Int(Float)> function;

 {

     Float x = function.Arg<0>();

-   

+

     Int reinterpret = As<Int>(x);

-   

+

     Return(reinterpret);

 }

 ```

@@ -185,7 +185,7 @@
 Function<Float(Float)> function;

 {

     Pointer<Float> x = function.Arg<0>();

-   

+

     If(x > 0.0f)

     {

         Return(1.0f);

diff --git a/src/Android.bp b/src/Android.bp
index 554a533..8f709bc 100644
--- a/src/Android.bp
+++ b/src/Android.bp
@@ -176,7 +176,6 @@
     srcs: [
         "Reactor/Reactor.cpp",
         "Reactor/LLVMReactor.cpp",
-        "Reactor/Routine.cpp",
         "Reactor/Debug.cpp",
     ],
 
@@ -196,7 +195,6 @@
     srcs: [
         "Reactor/Reactor.cpp",
         "Reactor/LLVMReactor.cpp",
-        "Reactor/Routine.cpp",
         "Reactor/Debug.cpp",
     ],
 
@@ -223,7 +221,6 @@
     srcs: [
         "Reactor/Reactor.cpp",
         "Reactor/SubzeroReactor.cpp",
-        "Reactor/Routine.cpp",
         "Reactor/Optimizer.cpp",
         "Reactor/Debug.cpp",
     ],
diff --git a/src/Android.mk b/src/Android.mk
index 4bcd422..4777003 100644
--- a/src/Android.mk
+++ b/src/Android.mk
@@ -57,7 +57,6 @@
 
 COMMON_SRC_FILES += \
 	Reactor/Reactor.cpp \
-	Reactor/Routine.cpp \
 	Reactor/Debug.cpp \
 	Reactor/DebugAndroid.cpp \
 	Reactor/ExecutableMemory.cpp
diff --git a/src/Common/Timer.cpp b/src/Common/Timer.cpp
index db0ba4a..0fa339f 100644
--- a/src/Common/Timer.cpp
+++ b/src/Common/Timer.cpp
@@ -65,9 +65,7 @@
 				return __rdtsc();
 			#endif
 		#elif defined(__i386__) || defined(__x86_64__)
-			int64_t tsc;
-			__asm volatile("rdtsc": "=A" (tsc));
-			return tsc;
+			return __builtin_ia32_rdtsc();
 		#else
 			return 0;
 		#endif
diff --git a/src/D3D9/Direct3DDevice9.cpp b/src/D3D9/Direct3DDevice9.cpp
index 71f5f53..2f69420 100644
--- a/src/D3D9/Direct3DDevice9.cpp
+++ b/src/D3D9/Direct3DDevice9.cpp
@@ -1830,7 +1830,7 @@
 			void *destBuffer = dest->lockExternal(0, 0, 0, sw::LOCK_WRITEONLY, sw::PUBLIC);
 
 			static void (__cdecl *blitFunction)(void *dst, void *src);
-			static sw::Routine *blitRoutine;
+			static std::shared_ptr<sw::Routine> blitRoutine;
 			static sw::BlitState blitState = {};
 
 			sw::BlitState update;
@@ -1846,8 +1846,6 @@
 			if(memcmp(&blitState, &update, sizeof(sw::BlitState)) != 0)
 			{
 				blitState = update;
-				delete blitRoutine;
-
 				blitRoutine = sw::FrameBuffer::copyRoutine(blitState);
 				blitFunction = (void(__cdecl*)(void*, void*))blitRoutine->getEntry();
 			}
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 2ca5f3f..f6d714f 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -52,7 +52,7 @@
 		}
 
 		State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), { 0xF });
-		Routine *blitRoutine = getBlitRoutine(state);
+		auto blitRoutine = getBlitRoutine(state);
 		if(!blitRoutine)
 		{
 			return;
@@ -1304,7 +1304,7 @@
 		return s;
 	}
 
-	Routine *Blitter::generate(const State &state)
+	std::shared_ptr<Routine> Blitter::generate(const State &state)
 	{
 		Function<Void(Pointer<Byte>)> function;
 		{
@@ -1535,13 +1535,13 @@
 			}
 		}
 
-		return function(vk::ReactorOptimizationLevel, "BlitRoutine");
+		return function("BlitRoutine");
 	}
 
-	Routine *Blitter::getBlitRoutine(const State &state)
+	std::shared_ptr<Routine> Blitter::getBlitRoutine(const State &state)
 	{
 		std::unique_lock<std::mutex> lock(blitMutex);
-		Routine *blitRoutine = blitCache.query(state);
+		auto blitRoutine = blitCache.query(state);
 
 		if(!blitRoutine)
 		{
@@ -1559,10 +1559,10 @@
 		return blitRoutine;
 	}
 
-	Routine *Blitter::getCornerUpdateRoutine(const State &state)
+	std::shared_ptr<Routine> Blitter::getCornerUpdateRoutine(const State &state)
 	{
 		std::unique_lock<std::mutex> lock(cornerUpdateMutex);
-		Routine *cornerUpdateRoutine = cornerUpdateCache.query(state);
+		auto cornerUpdateRoutine = cornerUpdateCache.query(state);
 
 		if(!cornerUpdateRoutine)
 		{
@@ -1587,7 +1587,7 @@
 		State state(format, format.getNonQuadLayoutFormat(), VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT,
 					{false, false});
 
-		Routine *blitRoutine = getBlitRoutine(state);
+		auto blitRoutine = getBlitRoutine(state);
 		if(!blitRoutine)
 		{
 			return;
@@ -1653,7 +1653,7 @@
 		State state(format.getNonQuadLayoutFormat(), format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT,
 					{false, false});
 
-		Routine *blitRoutine = getBlitRoutine(state);
+		auto blitRoutine = getBlitRoutine(state);
 		if(!blitRoutine)
 		{
 			return;
@@ -1760,7 +1760,7 @@
 		                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
 		                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
 
-		Routine *blitRoutine = getBlitRoutine(state);
+		auto blitRoutine = getBlitRoutine(state);
 		if(!blitRoutine)
 		{
 			return;
@@ -1857,7 +1857,7 @@
 		write(c0, layer + ComputeOffset(x0, y0, pitchB, bytes, quadLayout), state);
 	}
 
-	Routine *Blitter::generateCornerUpdate(const State& state)
+	std::shared_ptr<Routine> Blitter::generateCornerUpdate(const State& state)
 	{
 		// Reading and writing from/to the same image
 		ASSERT(state.sourceFormat == state.destFormat);
@@ -1890,7 +1890,7 @@
 			}
 		}
 
-		return function(vk::ReactorOptimizationLevel, "BlitRoutine");
+		return function("BlitRoutine");
 	}
 
 	void Blitter::updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers)
@@ -1958,7 +1958,7 @@
 			UNIMPLEMENTED("Multi-sampled cube: %d samples", static_cast<int>(samples));
 		}
 
-		Routine *cornerUpdateRoutine = getCornerUpdateRoutine(state);
+		auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
 		if(!cornerUpdateRoutine)
 		{
 			return;
diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
index b95f14c..c8cddf3 100644
--- a/src/Device/Blitter.hpp
+++ b/src/Device/Blitter.hpp
@@ -134,10 +134,10 @@
 		static Int ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes, bool quadLayout);
 		static Float4 LinearToSRGB(Float4 &color);
 		static Float4 sRGBtoLinear(Float4 &color);
-		Routine *getBlitRoutine(const State &state);
-		Routine *generate(const State &state);
-		Routine *getCornerUpdateRoutine(const State &state);
-		Routine *generateCornerUpdate(const State& state);
+		std::shared_ptr<Routine> getBlitRoutine(const State &state);
+		std::shared_ptr<Routine> generate(const State &state);
+		std::shared_ptr<Routine> getCornerUpdateRoutine(const State &state);
+		std::shared_ptr<Routine> generateCornerUpdate(const State& state);
 		void computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state);
 
 		void copyCubeEdge(vk::Image* image,
diff --git a/src/Device/Config.hpp b/src/Device/Config.hpp
index ecadc59..3203c14 100644
--- a/src/Device/Config.hpp
+++ b/src/Device/Config.hpp
@@ -49,14 +49,6 @@
 	{
 		OUTLINE_RESOLUTION = 8192,   // Maximum vertical resolution of the render target
 		MIPMAP_LEVELS = 14,
-		FRAGMENT_UNIFORM_VECTORS = 264,
-		VERTEX_UNIFORM_VECTORS = 259,
-		MAX_VERTEX_INPUTS = 32,
-		MAX_VERTEX_OUTPUTS = 34,
-		MAX_FRAGMENT_INPUTS = 32,
-		MAX_FRAGMENT_UNIFORM_BLOCKS = 12,
-		MAX_VERTEX_UNIFORM_BLOCKS = 12,
-		MAX_UNIFORM_BUFFER_BINDINGS = MAX_FRAGMENT_UNIFORM_BLOCKS + MAX_VERTEX_UNIFORM_BLOCKS,   // Limited to 127 by SourceParameter.bufferIndex in Shader.hpp
 		MAX_UNIFORM_BLOCK_SIZE = 16384,
 		MAX_CLIP_PLANES = 6,
 		MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
@@ -65,8 +57,7 @@
 		MAX_PROGRAM_TEXEL_OFFSET = 7,
 		MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
 		RENDERTARGETS = 8,
-		NUM_TEMPORARY_REGISTERS = 4096,
-		MAX_INTERFACE_COMPONENTS = 32 * 4,  // Must be multiple of 4 for 16-byte alignment.
+		MAX_INTERFACE_COMPONENTS = 16 * 4,  // Must be multiple of 4 for 16-byte alignment.
 	};
 }
 
diff --git a/src/Device/Context.cpp b/src/Device/Context.cpp
index 08b7139..ef3e123 100644
--- a/src/Device/Context.cpp
+++ b/src/Device/Context.cpp
@@ -86,7 +86,7 @@
 	void Context::init()
 	{
 		// Set vertex streams to null stream
-		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+		for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
 		{
 			input[i].defaults();
 		}
diff --git a/src/Device/Context.hpp b/src/Device/Context.hpp
index 0957aeb..9a2864a 100644
--- a/src/Device/Context.hpp
+++ b/src/Device/Context.hpp
@@ -81,7 +81,7 @@
 
 		vk::DescriptorSet::Bindings descriptorSets = {};
 		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
-		Stream input[MAX_VERTEX_INPUTS];
+		Stream input[MAX_INTERFACE_COMPONENTS / 4];
 		void *indexBuffer;
 
 		vk::ImageView *renderTarget[RENDERTARGETS];
diff --git a/src/Device/LRUCache.hpp b/src/Device/LRUCache.hpp
index 180b5b9..9f6c120 100644
--- a/src/Device/LRUCache.hpp
+++ b/src/Device/LRUCache.hpp
@@ -19,6 +19,7 @@
 
 #include <cstring>
 #include <type_traits>
+#include <unordered_map>
 
 namespace sw
 {
@@ -28,15 +29,15 @@
 	public:
 		LRUCache(int n);
 
-		~LRUCache();
+		virtual ~LRUCache();
 
-		Data *query(const Key &key) const;
-		Data *add(const Key &key, Data *data);
+		Data query(const Key &key) const;
+		virtual Data add(const Key &key, const Data &data);
 
 		int getSize() {return size;}
 		Key &getKey(int i) {return key[i];}
 
-	private:
+	protected:
 		int size;
 		int mask;
 		int top;
@@ -44,7 +45,30 @@
 
 		Key *key;
 		Key **ref;
-		Data **data;
+		Data *data;
+	};
+
+	template<class Key, class Data>
+	class LRUConstCache : public LRUCache<Key, Data>   // LRUCache plus a key -> data snapshot map that is rebuilt only by updateConstCache()
+	{
+		using LRUBase = LRUCache<Key, Data>;
+	public:
+		LRUConstCache(int n) : LRUBase(n) {}   // n is forwarded unchanged to the LRUCache constructor
+		~LRUConstCache() { clearConstCache(); }   // empties the snapshot map
+
+		Data add(const Key &key, const Data& data) override   // adds to the LRU and marks the snapshot stale; the map itself is not touched here
+		{
+			constCacheNeedsUpdate = true;
+			return LRUBase::add(key, data);
+		}
+
+		void updateConstCache();   // rebuilds constCache from the LRU slots if add() was called since the last rebuild
+		Data queryConstCache(const Key &key) const;   // map lookup only; returns nullptr on a miss
+
+	private:
+		void clearConstCache();   // removes every entry from constCache
+		bool constCacheNeedsUpdate = false;   // set by add(), cleared by updateConstCache()
+		std::unordered_map<Key, Data> constCache;   // snapshot of the LRU contents; Key must be usable as an unordered_map key
 	};
 
 	// Helper class for clearing the memory of objects at construction.
@@ -100,12 +124,10 @@
 
 		key = new Key[size];
 		ref = new Key*[size];
-		data = new Data*[size];
+		data = new Data[size];
 
 		for(int i = 0; i < size; i++)
 		{
-			data[i] = nullptr;
-
 			ref[i] = &key[i];
 		}
 	}
@@ -119,21 +141,12 @@
 		delete[] ref;
 		ref = nullptr;
 
-		for(int i = 0; i < size; i++)
-		{
-			if(data[i])
-			{
-				data[i]->unbind();
-				data[i] = nullptr;
-			}
-		}
-
 		delete[] data;
 		data = nullptr;
 	}
 
 	template<class Key, class Data>
-	Data *LRUCache<Key, Data>::query(const Key &key) const
+	Data LRUCache<Key, Data>::query(const Key &key) const
 	{
 		for(int i = top; i > top - fill; i--)
 		{
@@ -141,14 +154,14 @@
 
 			if(key == *ref[j])
 			{
-				Data *hit = data[j];
+				Data hit = data[j];
 
 				if(i != top)
 				{
 					// Move one up
 					int k = (j + 1) & mask;
 
-					Data *swapD = data[k];
+					Data swapD = data[k];
 					data[k] = data[j];
 					data[j] = swapD;
 
@@ -165,24 +178,48 @@
 	}
 
 	template<class Key, class Data>
-	Data *LRUCache<Key, Data>::add(const Key &key, Data *data)
+	Data LRUCache<Key, Data>::add(const Key &key, const Data &data)
 	{
 		top = (top + 1) & mask;
 		fill = fill + 1 < size ? fill + 1 : size;
 
 		*ref[top] = key;
-
-		data->bind();
-
-		if(this->data[top])
-		{
-			this->data[top]->unbind();
-		}
-
 		this->data[top] = data;
 
 		return data;
 	}
+
+	template<class Key, class Data>
+	void LRUConstCache<Key, Data>::clearConstCache()   // Drops every entry of the snapshot map; the LRU slots are left untouched.
+	{
+		constCache.clear();
+	}
+
+	template<class Key, class Data>
+	void LRUConstCache<Key, Data>::updateConstCache()   // Rebuilds constCache from the LRU slots, but only if add() flagged it stale.
+	{
+		if(constCacheNeedsUpdate)
+		{
+			clearConstCache();   // start from an empty map so entries no longer in the LRU are dropped
+
+			for(int i = 0; i < LRUBase::size; i++)   // visits every slot, not just the first `fill` ones
+			{
+				if(LRUBase::data[i])   // skip slots whose Data tests false (e.g. a slot add() has not written yet)
+				{
+					constCache[*LRUBase::ref[i]] = LRUBase::data[i];   // ref[i] points at the key stored for slot i
+				}
+			}
+
+			constCacheNeedsUpdate = false;   // snapshot now matches the LRU until the next add()
+		}
+	}
+
+	template<class Key, class Data>
+	Data LRUConstCache<Key, Data>::queryConstCache(const Key &key) const   // Looks key up in the snapshot map only; the LRU slots are not consulted or reordered.
+	{
+		auto it = constCache.find(key);
+		return (it != constCache.end()) ? it->second : nullptr;   // a miss yields nullptr, so Data must be constructible from nullptr
+	}
 }
 
 #endif   // sw_LRUCache_hpp
diff --git a/src/Device/PixelProcessor.cpp b/src/Device/PixelProcessor.cpp
index 51f4517..d6593b2 100644
--- a/src/Device/PixelProcessor.cpp
+++ b/src/Device/PixelProcessor.cpp
@@ -227,18 +227,18 @@
 		return state;
 	}
 
-	Routine *PixelProcessor::routine(const State &state,
+	std::shared_ptr<Routine> PixelProcessor::routine(const State &state,
 		vk::PipelineLayout const *pipelineLayout,
 		SpirvShader const *pixelShader,
 		const vk::DescriptorSet::Bindings &descriptorSets)
 	{
-		Routine *routine = routineCache->query(state);
+		auto routine = routineCache->query(state);
 
 		if(!routine)
 		{
 			QuadRasterizer *generator = new PixelProgram(state, pipelineLayout, pixelShader, descriptorSets);
 			generator->generate();
-			routine = (*generator)(vk::ReactorOptimizationLevel, "PixelRoutine_%0.8X", state.shaderID);
+			routine = (*generator)("PixelRoutine_%0.8X", state.shaderID);
 			delete generator;
 
 			routineCache->add(state, routine);
diff --git a/src/Device/PixelProcessor.hpp b/src/Device/PixelProcessor.hpp
index e48f677..17cda85 100644
--- a/src/Device/PixelProcessor.hpp
+++ b/src/Device/PixelProcessor.hpp
@@ -129,8 +129,8 @@
 
 	protected:
 		const State update(const Context* context) const;
-		Routine *routine(const State &state, vk::PipelineLayout const *pipelineLayout,
-		                 SpirvShader const *pixelShader, const vk::DescriptorSet::Bindings &descriptorSets);
+		std::shared_ptr<Routine> routine(const State &state, vk::PipelineLayout const *pipelineLayout,
+		                                 SpirvShader const *pixelShader, const vk::DescriptorSet::Bindings &descriptorSets);
 		void setRoutineCacheSize(int routineCacheSize);
 
 		// Other semi-constants
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index ddba431..f37e261 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -26,6 +26,7 @@
 #include "System/Timer.hpp"
 #include "Vulkan/VkConfig.h"
 #include "Vulkan/VkDebug.hpp"
+#include "Vulkan/VkDevice.hpp"
 #include "Vulkan/VkFence.hpp"
 #include "Vulkan/VkImageView.hpp"
 #include "Vulkan/VkQueryPool.hpp"
@@ -162,7 +163,7 @@
 		deallocate(data);
 	}
 
-	Renderer::Renderer()
+	Renderer::Renderer(vk::Device* device) : device(device)
 	{
 		for(int i = 0; i < 16; i++)
 		{
@@ -339,10 +340,6 @@
 		draw->indexType = indexType;
 		draw->batchSize = batch;
 
-		vertexRoutine->bind();
-		setupRoutine->bind();
-		pixelRoutine->bind();
-
 		draw->vertexRoutine = vertexRoutine;
 		draw->setupRoutine = setupRoutine;
 		draw->pixelRoutine = pixelRoutine;
@@ -363,7 +360,7 @@
 		ASSERT(!draw->events);
 		draw->events = events;
 
-		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+		for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
 		{
 			data->input[i] = context->input[i].buffer;
 			data->stride[i] = context->input[i].vertexStride;
@@ -733,6 +730,7 @@
 	void Renderer::synchronize()
 	{
 		sync.wait();
+		device->updateSamplingRoutineConstCache();   // runs only after sync.wait() has returned; NOTE(review): presumably no draws are in flight at this point - confirm
 	}
 
 	void Renderer::finishRendering(Task &pixelTask)
@@ -785,9 +783,9 @@
 					draw.queries = nullptr;
 				}
 
-				draw.vertexRoutine->unbind();
-				draw.setupRoutine->unbind();
-				draw.pixelRoutine->unbind();
+				draw.vertexRoutine.reset();
+				draw.setupRoutine.reset();
+				draw.pixelRoutine.reset();
 
 				if(draw.events)
 				{
@@ -1196,12 +1194,6 @@
 		P[3].y -= Y;
 		C[3] = Clipper::ComputeClipFlags(P[3]);
 
-		triangle.v1 = triangle.v0;
-		triangle.v2 = triangle.v0;
-
-		triangle.v1.projected.x += iround(16 * 0.5f * pSize);
-		triangle.v2.projected.y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
-
 		Polygon polygon(P, 4);
 
 		if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
@@ -1216,6 +1208,11 @@
 				}
 			}
 
+			triangle.v1 = triangle.v0;
+			triangle.v2 = triangle.v0;
+
+			triangle.v1.projected.x += iround(16 * 0.5f * pSize);
+			triangle.v2.projected.y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
 			return setupRoutine(&primitive, &triangle, &polygon, &data);
 		}
 
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index 85367f1..2ae2ca5 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -32,6 +32,7 @@
 namespace vk
 {
 	class DescriptorSet;
+	class Device;
 	class Query;
 }
 
@@ -52,8 +53,8 @@
 		vk::DescriptorSet::Bindings descriptorSets = {};
 		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
 
-		const void *input[MAX_VERTEX_INPUTS];
-		unsigned int stride[MAX_VERTEX_INPUTS];
+		const void *input[MAX_INTERFACE_COMPONENTS / 4];
+		unsigned int stride[MAX_INTERFACE_COMPONENTS / 4];
 		const void *indices;
 
 		int instanceID;
@@ -156,7 +157,7 @@
 		};
 
 	public:
-		Renderer();
+		Renderer(vk::Device* device);
 
 		virtual ~Renderer();
 
@@ -251,9 +252,11 @@
 		SetupProcessor::State setupState;
 		PixelProcessor::State pixelState;
 
-		Routine *vertexRoutine;
-		Routine *setupRoutine;
-		Routine *pixelRoutine;
+		std::shared_ptr<Routine> vertexRoutine;
+		std::shared_ptr<Routine> setupRoutine;
+		std::shared_ptr<Routine> pixelRoutine;
+
+		vk::Device* device;
 	};
 
 	struct DrawCall
@@ -266,9 +269,9 @@
 		std::atomic<int> indexType;
 		std::atomic<int> batchSize;
 
-		Routine *vertexRoutine;
-		Routine *setupRoutine;
-		Routine *pixelRoutine;
+		std::shared_ptr<Routine> vertexRoutine;
+		std::shared_ptr<Routine> setupRoutine;
+		std::shared_ptr<Routine> pixelRoutine;
 
 		VertexProcessor::RoutinePointer vertexPointer;
 		SetupProcessor::RoutinePointer setupPointer;
diff --git a/src/Device/RoutineCache.hpp b/src/Device/RoutineCache.hpp
index 8420468..61f635a 100644
--- a/src/Device/RoutineCache.hpp
+++ b/src/Device/RoutineCache.hpp
@@ -24,7 +24,7 @@
 	using namespace rr;
 
 	template<class State>
-	using RoutineCache = LRUCache<State, Routine>;
+	using RoutineCache = LRUCache<State, std::shared_ptr<Routine>>;
 }
 
 #endif   // sw_RoutineCache_hpp
diff --git a/src/Device/SetupProcessor.cpp b/src/Device/SetupProcessor.cpp
index 8294514..abf4c36 100644
--- a/src/Device/SetupProcessor.cpp
+++ b/src/Device/SetupProcessor.cpp
@@ -94,9 +94,9 @@
 		return state;
 	}
 
-	Routine *SetupProcessor::routine(const State &state)
+	std::shared_ptr<Routine> SetupProcessor::routine(const State &state)
 	{
-		Routine *routine = routineCache->query(state);
+		auto routine = routineCache->query(state);
 
 		if(!routine)
 		{
diff --git a/src/Device/SetupProcessor.hpp b/src/Device/SetupProcessor.hpp
index c82b8b5..a84f818 100644
--- a/src/Device/SetupProcessor.hpp
+++ b/src/Device/SetupProcessor.hpp
@@ -67,7 +67,7 @@
 
 	protected:
 		State update(const sw::Context* context) const;
-		Routine *routine(const State &state);
+		std::shared_ptr<Routine> routine(const State &state);
 
 		void setRoutineCacheSize(int cacheSize);
 
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index 76daf76..c6e5c13 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp
@@ -79,7 +79,7 @@
 
 		state.shaderID = context->vertexShader->getSerialID();
 
-		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+		for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
 		{
 			state.input[i].type = context->input[i].type;
 			state.input[i].count = context->input[i].count;
@@ -94,18 +94,18 @@
 		return state;
 	}
 
-	Routine *VertexProcessor::routine(const State &state,
-	                                  vk::PipelineLayout const *pipelineLayout,
-	                                  SpirvShader const *vertexShader,
-	                                  const vk::DescriptorSet::Bindings &descriptorSets)
+	std::shared_ptr<Routine> VertexProcessor::routine(const State &state,
+	                                                  vk::PipelineLayout const *pipelineLayout,
+	                                                  SpirvShader const *vertexShader,
+	                                                  const vk::DescriptorSet::Bindings &descriptorSets)
 	{
-		Routine *routine = routineCache->query(state);
+		auto routine = routineCache->query(state);
 
 		if(!routine)   // Create one
 		{
 			VertexRoutine *generator = new VertexProgram(state, pipelineLayout, vertexShader, descriptorSets);
 			generator->generate();
-			routine = (*generator)(vk::ReactorOptimizationLevel, "VertexRoutine_%0.8X", state.shaderID);
+			routine = (*generator)("VertexRoutine_%0.8X", state.shaderID);
 			delete generator;
 
 			routineCache->add(state, routine);
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index ce91582..24a9b3f 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp
@@ -70,7 +70,7 @@
 				unsigned int attribType : BITS(SpirvShader::ATTRIBTYPE_LAST);
 			};
 
-			Input input[MAX_VERTEX_INPUTS];
+			Input input[MAX_INTERFACE_COMPONENTS / 4];
 		};
 
 		struct State : States
@@ -88,8 +88,8 @@
 
 	protected:
 		const State update(const sw::Context* context);
-		Routine *routine(const State &state, vk::PipelineLayout const *pipelineLayout,
-		                 SpirvShader const *vertexShader, const vk::DescriptorSet::Bindings &descriptorSets);
+		std::shared_ptr<Routine> routine(const State &state, vk::PipelineLayout const *pipelineLayout,
+		                                 SpirvShader const *vertexShader, const vk::DescriptorSet::Bindings &descriptorSets);
 
 		void setRoutineCacheSize(int cacheSize);
 
diff --git a/src/Main/FrameBuffer.cpp b/src/Main/FrameBuffer.cpp
index 7d2e6a0..5e4f6c3 100644
--- a/src/Main/FrameBuffer.cpp
+++ b/src/Main/FrameBuffer.cpp
@@ -66,8 +66,6 @@
 			blitThread->join();
 			delete blitThread;
 		}
-
-		delete blitRoutine;
 	}
 
 	void FrameBuffer::setCursorImage(sw::Surface *cursorImage)
@@ -154,8 +152,6 @@
 		if(memcmp(&blitState, &updateState, sizeof(BlitState)) != 0)
 		{
 			blitState = updateState;
-			delete blitRoutine;
-
 			blitRoutine = copyRoutine(blitState);
 			blitFunction = (void(*)(void*, void*, Cursor*))blitRoutine->getEntry();
 		}
@@ -163,7 +159,7 @@
 		blitFunction(framebuffer, renderbuffer, &cursor);
 	}
 
-	Routine *FrameBuffer::copyRoutine(const BlitState &state)
+	std::shared_ptr<Routine> FrameBuffer::copyRoutine(const BlitState &state)
 	{
 		const int width = state.width;
 		const int height = state.height;
diff --git a/src/Main/FrameBuffer.hpp b/src/Main/FrameBuffer.hpp
index f07feb3..94a1890 100644
--- a/src/Main/FrameBuffer.hpp
+++ b/src/Main/FrameBuffer.hpp
@@ -54,7 +54,7 @@
 		static void setCursorOrigin(int x0, int y0);
 		static void setCursorPosition(int x, int y);
 
-		static Routine *copyRoutine(const BlitState &state);
+		static std::shared_ptr<Routine> copyRoutine(const BlitState &state);
 
 	protected:
 		void copy(sw::Surface *source);
@@ -90,7 +90,7 @@
 		static Cursor cursor;
 
 		void (*blitFunction)(void *dst, void *src, Cursor *cursor);
-		Routine *blitRoutine;
+		std::shared_ptr<Routine> blitRoutine;
 		BlitState blitState;     // State of the current blitRoutine.
 		BlitState updateState;   // State of the routine to be generated.
 
diff --git a/src/Main/SwiftConfig.cpp b/src/Main/SwiftConfig.cpp
index aa17aa8..5876054 100644
--- a/src/Main/SwiftConfig.cpp
+++ b/src/Main/SwiftConfig.cpp
@@ -401,19 +401,20 @@
 		html += "<h2><em>Compiler optimizations</em></h2>\n";
 		html += "<table>\n";
 
-		for(int pass = 0; pass < 10; pass++)
+		for(size_t pass = 0; pass < config.optimization.size(); pass++)
 		{
 			html += "<tr><td>Optimization pass " + itoa(pass + 1) + ":</td><td><select name='optimization" + itoa(pass + 1) + "' title='An optimization pass for the shader compiler.'>\n";
-			html += "<option value='0'"  + (config.optimization[pass] == 0  ? selected : empty) + ">Disabled" + (pass > 0 ? " (default)" : "") + "</option>\n";
-			html += "<option value='1'"  + (config.optimization[pass] == 1  ? selected : empty) + ">Instruction Combining" + (pass == 0 ? " (default)" : "") + "</option>\n";
-			html += "<option value='2'"  + (config.optimization[pass] == 2  ? selected : empty) + ">Control Flow Simplification</option>\n";
-			html += "<option value='3'"  + (config.optimization[pass] == 3  ? selected : empty) + ">Loop Invariant Code Motion</option>\n";
-			html += "<option value='4'"  + (config.optimization[pass] == 4  ? selected : empty) + ">Aggressive Dead Code Elimination</option>\n";
-			html += "<option value='5'"  + (config.optimization[pass] == 5  ? selected : empty) + ">Global Value Numbering</option>\n";
-			html += "<option value='6'"  + (config.optimization[pass] == 6  ? selected : empty) + ">Commutative Expressions Reassociation</option>\n";
-			html += "<option value='7'"  + (config.optimization[pass] == 7  ? selected : empty) + ">Dead Store Elimination</option>\n";
-			html += "<option value='8'"  + (config.optimization[pass] == 8  ? selected : empty) + ">Sparse Conditional Copy Propagation</option>\n";
-			html += "<option value='9'"  + (config.optimization[pass] == 9  ? selected : empty) + ">Scalar Replacement of Aggregates</option>\n";
+			html += "<option value='0'"   + (config.optimization[pass] == rr::Optimization::Pass::Disabled ? selected : empty) + ">Disabled" + (pass > 0 ? " (default)" : "") + "</option>\n";
+			html += "<option value='1'"   + (config.optimization[pass] == rr::Optimization::Pass::InstructionCombining ? selected : empty) + ">Instruction Combining" + (pass == 0 ? " (default)" : "") + "</option>\n";
+			html += "<option value='2'"   + (config.optimization[pass] == rr::Optimization::Pass::CFGSimplification ? selected : empty) + ">Control Flow Simplification</option>\n";
+			html += "<option value='3'"   + (config.optimization[pass] == rr::Optimization::Pass::LICM ? selected : empty) + ">Loop Invariant Code Motion</option>\n";
+			html += "<option value='4'"   + (config.optimization[pass] == rr::Optimization::Pass::AggressiveDCE ? selected : empty) + ">Aggressive Dead Code Elimination</option>\n";
+			html += "<option value='5'"   + (config.optimization[pass] == rr::Optimization::Pass::GVN ? selected : empty) + ">Global Value Numbering</option>\n";
+			html += "<option value='6'"   + (config.optimization[pass] == rr::Optimization::Pass::Reassociate ? selected : empty) + ">Commutative Expressions Reassociation</option>\n";
+			html += "<option value='7'"   + (config.optimization[pass] == rr::Optimization::Pass::DeadStoreElimination ? selected : empty) + ">Dead Store Elimination</option>\n";
+			html += "<option value='8'"   + (config.optimization[pass] == rr::Optimization::Pass::SCCP ? selected : empty) + ">Sparse Conditional Copy Propagation</option>\n";
+			html += "<option value='9'"   + (config.optimization[pass] == rr::Optimization::Pass::ScalarReplAggregates ? selected : empty) + ">Scalar Replacement of Aggregates</option>\n";
+			html += "<option value='10'"  + (config.optimization[pass] == rr::Optimization::Pass::EarlyCSEPass ? selected : empty) + ">Eliminate trivially redundant instructions</option>\n";
 			html += "</select></td></tr>\n";
 		}
 
@@ -652,7 +653,7 @@
 			}
 			else if(sscanf(post, "optimization%d=%d", &index, &integer))
 			{
-				config.optimization[index - 1] = (rr::Optimization)integer;
+				config.optimization[index - 1] = (rr::Optimization::Pass)integer;
 			}
 			else if(strstr(post, "disableServer=on"))
 			{
@@ -737,9 +738,10 @@
 		config.enableSSSE3 = ini.getBoolean("Processor", "EnableSSSE3", true);
 		config.enableSSE4_1 = ini.getBoolean("Processor", "EnableSSE4_1", true);
 
-		for(int pass = 0; pass < 10; pass++)
+		for(size_t pass = 0; pass < config.optimization.size(); pass++)
 		{
-			config.optimization[pass] = (rr::Optimization)ini.getInteger("Optimization", "OptimizationPass" + itoa(pass + 1), pass == 0 ? rr::InstructionCombining : rr::Disabled);
+			auto def = pass == 0 ? rr::Optimization::Pass::InstructionCombining : rr::Optimization::Pass::Disabled;
+			config.optimization[pass] = (rr::Optimization::Pass)ini.getInteger("Optimization", "OptimizationPass" + itoa(pass + 1), (int)def);
 		}
 
 		config.disableServer = ini.getBoolean("Testing", "DisableServer", false);
@@ -795,9 +797,9 @@
 		ini.addValue("Processor", "EnableSSSE3", itoa(config.enableSSSE3));
 		ini.addValue("Processor", "EnableSSE4_1", itoa(config.enableSSE4_1));
 
-		for(int pass = 0; pass < 10; pass++)
+		for(size_t pass = 0; pass < config.optimization.size(); pass++)
 		{
-			ini.addValue("Optimization", "OptimizationPass" + itoa(pass + 1), itoa(config.optimization[pass]));
+			ini.addValue("Optimization", "OptimizationPass" + itoa(pass + 1), itoa((int)config.optimization[pass]));
 		}
 
 		ini.addValue("Testing", "DisableServer", itoa(config.disableServer));
diff --git a/src/Main/SwiftConfig.hpp b/src/Main/SwiftConfig.hpp
index ad3dcb5..a40648c 100644
--- a/src/Main/SwiftConfig.hpp
+++ b/src/Main/SwiftConfig.hpp
@@ -21,6 +21,7 @@
 #include "Common/MutexLock.hpp"
 #include "Common/Socket.hpp"
 
+#include <array>
 #include <string>
 
 namespace sw
@@ -48,7 +49,7 @@
 			bool enableSSE3;
 			bool enableSSSE3;
 			bool enableSSE4_1;
-			rr::Optimization optimization[10];
+			std::array<rr::Optimization::Pass, 10> optimization;
 			bool disableServer;
 			bool keepSystemCursor;
 			bool forceWindowed;
diff --git a/src/OpenGL/compiler/Compiler.vcxproj b/src/OpenGL/compiler/Compiler.vcxproj
index fbe01c5..5964bce 100644
--- a/src/OpenGL/compiler/Compiler.vcxproj
+++ b/src/OpenGL/compiler/Compiler.vcxproj
@@ -125,7 +125,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <BrowseInformation>true</BrowseInformation>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;4005;</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;4005;</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <ErrorReporting>Queue</ErrorReporting>

@@ -145,7 +145,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <BrowseInformation>true</BrowseInformation>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <ErrorReporting>Queue</ErrorReporting>

@@ -164,7 +164,7 @@
       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;4005;</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;4005;</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <ErrorReporting>Queue</ErrorReporting>

@@ -183,7 +183,7 @@
       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <ErrorReporting>Queue</ErrorReporting>

@@ -203,7 +203,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <OmitFramePointers>false</OmitFramePointers>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;4005;</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;4005;</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <ErrorReporting>Queue</ErrorReporting>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -222,7 +222,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <OmitFramePointers>false</OmitFramePointers>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <ErrorReporting>Queue</ErrorReporting>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

diff --git a/src/OpenGL/compiler/preprocessor/preprocessor.vcxproj b/src/OpenGL/compiler/preprocessor/preprocessor.vcxproj
index 07f364d..b2308ef 100644
--- a/src/OpenGL/compiler/preprocessor/preprocessor.vcxproj
+++ b/src/OpenGL/compiler/preprocessor/preprocessor.vcxproj
@@ -89,7 +89,7 @@
       <Optimization>Disabled</Optimization>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>4005;</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;4005;</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -104,7 +104,7 @@
       <Optimization>Disabled</Optimization>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -121,7 +121,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>4005;</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;4005;</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -140,7 +140,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;4005;4267;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

diff --git a/src/OpenGL/libEGL/BUILD.gn b/src/OpenGL/libEGL/BUILD.gn
index 993ca6a..a17a3a7 100644
--- a/src/OpenGL/libEGL/BUILD.gn
+++ b/src/OpenGL/libEGL/BUILD.gn
@@ -73,7 +73,11 @@
       "CoreFoundation.framework",
       "IOSurface.framework",
     ]
-    ldflags = [ "-Wl,-install_name,@rpath/libswiftshader_libEGL.dylib" ]
+    ldflags = [
+      "-Wl,-install_name,@rpath/libswiftshader_libEGL.dylib",
+      "-Wl,-exported_symbols_list," +
+          rebase_path("libEGL.exports", root_build_dir),
+    ]
   } else if (is_linux) {
     if (use_x11) {
       sources += [ "../../Main/libX11.cpp" ]
diff --git a/src/OpenGL/libEGL/libEGL.cpp b/src/OpenGL/libEGL/libEGL.cpp
index 51f5309..70df27f 100644
--- a/src/OpenGL/libEGL/libEGL.cpp
+++ b/src/OpenGL/libEGL/libEGL.cpp
@@ -932,13 +932,6 @@
 		return EGL_FALSE;
 	}
 
-	if((draw != EGL_NO_SURFACE && drawSurface->hasClientBuffer()) ||
-	   (read != EGL_NO_SURFACE && readSurface->hasClientBuffer()))
-	{
-		// Make current is not supported on IOSurface pbuffers.
-		return error(EGL_BAD_SURFACE, EGL_FALSE);
-	}
-
 	if((draw != EGL_NO_SURFACE) ^ (read != EGL_NO_SURFACE))
 	{
 		return error(EGL_BAD_MATCH, EGL_FALSE);
diff --git a/src/OpenGL/libEGL/libEGL.vcxproj b/src/OpenGL/libEGL/libEGL.vcxproj
index 64c35fd..7c6d827 100644
--- a/src/OpenGL/libEGL/libEGL.vcxproj
+++ b/src/OpenGL/libEGL/libEGL.vcxproj
@@ -131,7 +131,7 @@
       <BrowseInformation>true</BrowseInformation>

       <TreatWarningAsError>true</TreatWarningAsError>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

-      <DisableSpecificWarnings>5030</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

@@ -164,7 +164,7 @@
       <BrowseInformation>true</BrowseInformation>

       <TreatWarningAsError>true</TreatWarningAsError>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

-      <DisableSpecificWarnings>5030</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

@@ -198,7 +198,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <TreatWarningAsError>true</TreatWarningAsError>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

-      <DisableSpecificWarnings>5030</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

@@ -232,7 +232,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <TreatWarningAsError>true</TreatWarningAsError>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

-      <DisableSpecificWarnings>5030</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

@@ -268,7 +268,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <OmitFramePointers>false</OmitFramePointers>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

@@ -302,7 +302,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <OmitFramePointers>false</OmitFramePointers>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

diff --git a/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj b/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj
index b5da9ce..28d6728 100644
--- a/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj
+++ b/src/OpenGL/libGLES_CM/libGLES_CM.vcxproj
@@ -139,6 +139,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <AdditionalDependencies>dxguid.lib;WS2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>

@@ -169,6 +170,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <AdditionalDependencies>dxguid.lib;WS2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>

@@ -206,6 +208,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <AdditionalDependencies>dxguid.lib;WS2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>

@@ -244,6 +247,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <AdditionalDependencies>dxguid.lib;WS2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>

@@ -282,6 +286,7 @@
       <IntrinsicFunctions>false</IntrinsicFunctions>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <AdditionalDependencies>dxguid.lib;WS2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>

@@ -318,6 +323,7 @@
       <IntrinsicFunctions>false</IntrinsicFunctions>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <AdditionalDependencies>dxguid.lib;WS2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>

diff --git a/src/OpenGL/libGLESv2/BUILD.gn b/src/OpenGL/libGLESv2/BUILD.gn
index d7037ca..6fa7dff 100644
--- a/src/OpenGL/libGLESv2/BUILD.gn
+++ b/src/OpenGL/libGLESv2/BUILD.gn
@@ -125,7 +125,11 @@
   ]
 
   if (is_mac) {
-    ldflags = [ "-Wl,-install_name,@rpath/libswiftshader_libGLESv2.dylib" ]
+    ldflags = [
+      "-Wl,-install_name,@rpath/libswiftshader_libGLESv2.dylib",
+      "-Wl,-exported_symbols_list," +
+          rebase_path("libGLESv2.exports", root_build_dir),
+    ]
   } else if (is_linux) {
     inputs = [
       "libGLESv2.lds",
diff --git a/src/OpenGL/libGLESv2/libGLESv2.vcxproj b/src/OpenGL/libGLESv2/libGLESv2.vcxproj
index 8793995..a6d72c5 100644
--- a/src/OpenGL/libGLESv2/libGLESv2.vcxproj
+++ b/src/OpenGL/libGLESv2/libGLESv2.vcxproj
@@ -136,7 +136,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <BrowseInformation>true</BrowseInformation>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -168,7 +168,7 @@
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

       <BrowseInformation>true</BrowseInformation>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -207,7 +207,7 @@
       <WholeProgramOptimization>true</WholeProgramOptimization>

       <IntrinsicFunctions>false</IntrinsicFunctions>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -247,7 +247,7 @@
       <WholeProgramOptimization>true</WholeProgramOptimization>

       <IntrinsicFunctions>false</IntrinsicFunctions>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

@@ -288,7 +288,7 @@
       <WholeProgramOptimization>true</WholeProgramOptimization>

       <IntrinsicFunctions>false</IntrinsicFunctions>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

@@ -326,7 +326,7 @@
       <WholeProgramOptimization>true</WholeProgramOptimization>

       <IntrinsicFunctions>false</IntrinsicFunctions>

       <TreatWarningAsError>true</TreatWarningAsError>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

     </ClCompile>

diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index a769a2c..85c3783 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -181,6 +181,7 @@
 				case VK_FORMAT_R8_UNORM:
 				case VK_FORMAT_R8G8_UNORM:
 				case VK_FORMAT_R8G8B8A8_UNORM:
+				case VK_FORMAT_B8G8R8A8_UNORM:
 				case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
 				case VK_FORMAT_B8G8R8A8_SRGB:
 				case VK_FORMAT_R8G8B8A8_SRGB:
@@ -234,6 +235,7 @@
 			case VK_FORMAT_R8_UNORM:
 			case VK_FORMAT_R8G8_UNORM:
 			case VK_FORMAT_R8G8B8A8_UNORM:
+			case VK_FORMAT_B8G8R8A8_UNORM:
 			case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
 			case VK_FORMAT_B8G8R8A8_SRGB:
 			case VK_FORMAT_R8G8B8A8_SRGB:
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index 1973a71..80ac5df 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -453,7 +453,7 @@
 			Return(1);
 		}
 
-		routine = function(vk::ReactorOptimizationLevel, "SetupRoutine");
+		routine = function("SetupRoutine");
 	}
 
 	void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool perspective, int component)
@@ -601,7 +601,7 @@
 		#endif
 	}
 
-	Routine *SetupRoutine::getRoutine()
+	std::shared_ptr<Routine> SetupRoutine::getRoutine()
 	{
 		return routine;
 	}
diff --git a/src/Pipeline/SetupRoutine.hpp b/src/Pipeline/SetupRoutine.hpp
index 0ea0c71..469b4de 100644
--- a/src/Pipeline/SetupRoutine.hpp
+++ b/src/Pipeline/SetupRoutine.hpp
@@ -30,7 +30,7 @@
 		virtual ~SetupRoutine();
 
 		void generate();
-		Routine *getRoutine();
+		std::shared_ptr<Routine> getRoutine();
 
 	private:
 		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool perspective, int component);
@@ -40,7 +40,7 @@
 
 		const SetupProcessor::State &state;
 
-		Routine *routine;
+		std::shared_ptr<Routine> routine;
 	};
 }
 
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index ce621ee..dd04fc9 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -287,11 +287,11 @@
 	{
 
 		template<typename T>
-		T Load(Pointer ptr, bool robust, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+		T Load(Pointer ptr, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
 		{
 			using EL = typename Element<T>::type;
 
-			if (ptr.isStaticAllInBounds(sizeof(float)))
+			if (ptr.isStaticallyInBounds(sizeof(float), robustness))
 			{
 				// All elements are statically known to be in-bounds.
 				// We can avoid costly conditional on masks.
@@ -307,9 +307,19 @@
 					return T(*rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], alignment));
 				}
 			}
-			else if(robust)  // Disable OOB reads.
+			else
 			{
-				mask &= ptr.isInBounds(sizeof(float));
+				switch(robustness)
+				{
+				case OutOfBoundsBehavior::Nullify:
+				case OutOfBoundsBehavior::RobustBufferAccess:
+				case OutOfBoundsBehavior::UndefinedValue:
+					mask &= ptr.isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
+					break;
+				case OutOfBoundsBehavior::UndefinedBehavior:
+					// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
+					break;
+				}
 			}
 
 			auto offsets = ptr.offsets();
@@ -329,11 +339,26 @@
 					}
 					return out;
 				}
+
+				bool zeroMaskedLanes = true;
+				switch(robustness)
+				{
+				case OutOfBoundsBehavior::Nullify:
+				case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
+					zeroMaskedLanes = true;
+					break;
+				case OutOfBoundsBehavior::UndefinedValue:
+				case OutOfBoundsBehavior::UndefinedBehavior:
+					zeroMaskedLanes = false;
+					break;
+				}
+
 				if (ptr.hasStaticSequentialOffsets(sizeof(float)))
 				{
-					return rr::MaskedLoad(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), mask, alignment, robust);
+					return rr::MaskedLoad(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), mask, alignment, zeroMaskedLanes);
 				}
-				return rr::Gather(rr::Pointer<EL>(ptr.base), offsets, mask, alignment, robust);
+
+				return rr::Gather(rr::Pointer<EL>(ptr.base), offsets, mask, alignment, zeroMaskedLanes);
 			}
 			else
 			{
@@ -370,15 +395,22 @@
 		}
 
 		template<typename T>
-		void Store(Pointer ptr, T val, bool robust, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+		void Store(Pointer ptr, T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 		{
 			using EL = typename Element<T>::type;
 			constexpr size_t alignment = sizeof(float);
 			auto offsets = ptr.offsets();
 
-			if(robust)  // Disable OOB writes.
+			switch(robustness)
 			{
-				mask &= ptr.isInBounds(sizeof(float));
+			case OutOfBoundsBehavior::Nullify:
+			case OutOfBoundsBehavior::RobustBufferAccess:  // TODO: Allows writing anywhere within bounds. Could be faster than masking.
+			case OutOfBoundsBehavior::UndefinedValue:  // Should not be used for store operations. Treat as robust buffer access.
+				mask &= ptr.isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
+				break;
+			case OutOfBoundsBehavior::UndefinedBehavior:
+				// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
+				break;
 			}
 
 			if (!atomic && order == std::memory_order_relaxed)
@@ -400,7 +432,7 @@
 				}
 				else if (ptr.hasStaticSequentialOffsets(sizeof(float)))
 				{
-					if (ptr.isStaticAllInBounds(sizeof(float)))
+					if (ptr.isStaticallyInBounds(sizeof(float), robustness))
 					{
 						// Pointer has no elements OOB, and the store is not atomic.
 						// Perform a RMW.
@@ -487,7 +519,7 @@
 			{
 			case spv::OpEntryPoint:
 			{
-				auto executionModel = spv::ExecutionModel(insn.word(1));
+				executionModel = spv::ExecutionModel(insn.word(1));
 				auto id = Function::ID(insn.word(2));
 				auto name = insn.string(3);
 				auto stage = executionModelToStage(executionModel);
@@ -1967,6 +1999,36 @@
 		object.definition = insn;
 	}
 
+	OutOfBoundsBehavior SpirvShader::EmitState::getOutOfBoundsBehavior(spv::StorageClass storageClass) const  // Selects the out-of-bounds policy for memory accesses through a pointer of the given storage class.
+	{
+		switch(storageClass)
+		{
+		case spv::StorageClassUniform:
+		case spv::StorageClassStorageBuffer:
+			// Buffer resource access. The robustBufferAccess feature decides between robust and unchecked access.
+			return robustBufferAccess ? OutOfBoundsBehavior::RobustBufferAccess
+			                          : OutOfBoundsBehavior::UndefinedBehavior;
+
+		case spv::StorageClassImage:
+			return OutOfBoundsBehavior::UndefinedValue;  // "The value returned by a read of an invalid texel is undefined"
+
+		case spv::StorageClassInput:
+			if(executionModel == spv::ExecutionModelVertex)
+			{
+				// Vertex attributes follow robustBufferAccess rules.
+				return robustBufferAccess ? OutOfBoundsBehavior::RobustBufferAccess
+				                          : OutOfBoundsBehavior::UndefinedBehavior;
+			}
+			// Inputs of any other execution model: fall through to default case.
+		default:
+			// TODO(b/137183137): Optimize if the pointer resulted from OpInBoundsAccessChain.
+			// TODO(b/131224163): Optimize cases statically known to be within bounds.
+			return OutOfBoundsBehavior::UndefinedValue;
+		}
+
+		return OutOfBoundsBehavior::Nullify;  // Not reached: every path through the switch above returns. Presumably kept to silence missing-return warnings.
+	}
+
 	// emit-time
 
 	void SpirvShader::emitProlog(SpirvRoutine *routine) const
@@ -2004,7 +2066,7 @@
 
 	void SpirvShader::emit(SpirvRoutine *routine, RValue<SIMD::Int> const &activeLaneMask, const vk::DescriptorSet::Bindings &descriptorSets) const
 	{
-		EmitState state(routine, entryPoint, activeLaneMask, descriptorSets, robustBufferAccess);
+		EmitState state(routine, entryPoint, activeLaneMask, descriptorSets, robustBufferAccess, executionModel);
 
 		// Emit everything up to the first label
 		// TODO: Separate out dispatch of block from non-block instructions?
@@ -2743,7 +2805,8 @@
 				{
 					auto p = ptr + offset;
 					if (interleavedByLane) { p = interleaveByLane(p); }
-					SIMD::Store(p, initialValue.Float(i), state->robust, state->activeLaneMask());
+					auto robustness = OutOfBoundsBehavior::UndefinedBehavior;  // Local variables are always within bounds.
+					SIMD::Store(p, initialValue.Float(i), robustness, state->activeLaneMask());
 				});
 				break;
 			}
@@ -2786,16 +2849,15 @@
 		}
 
 		auto ptr = GetPointerToData(pointerId, 0, state);
-
 		bool interleavedByLane = IsStorageInterleavedByLane(pointerTy.storageClass);
-
 		auto &dst = state->createIntermediate(resultId, resultTy.sizeInComponents);
+		auto robustness = state->getOutOfBoundsBehavior(pointerTy.storageClass);
 
 		VisitMemoryObject(pointerId, [&](uint32_t i, uint32_t offset)
 		{
 			auto p = ptr + offset;
-			if (interleavedByLane) { p = interleaveByLane(p); }
-			dst.move(i, SIMD::Load<SIMD::Float>(p, state->robust, state->activeLaneMask(), atomic, memoryOrder));
+			if (interleavedByLane) { p = interleaveByLane(p); }  // TODO: Interleave once, then add offset?
+			dst.move(i, SIMD::Load<SIMD::Float>(p, robustness, state->activeLaneMask(), atomic, memoryOrder));
 		});
 
 		return EmitResult::Continue;
@@ -2823,6 +2885,7 @@
 
 		auto ptr = GetPointerToData(pointerId, 0, state);
 		bool interleavedByLane = IsStorageInterleavedByLane(pointerTy.storageClass);
+		auto robustness = state->getOutOfBoundsBehavior(pointerTy.storageClass);
 
 		if (object.kind == Object::Kind::Constant)
 		{
@@ -2832,7 +2895,7 @@
 			{
 				auto p = ptr + offset;
 				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, SIMD::Float(src[i]), state->robust, state->activeLaneMask(), atomic, memoryOrder);
+				SIMD::Store(p, SIMD::Float(src[i]), robustness, state->activeLaneMask(), atomic, memoryOrder);
 			});
 		}
 		else
@@ -2843,7 +2906,7 @@
 			{
 				auto p = ptr + offset;
 				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, src.Float(i), state->robust, state->activeLaneMask(), atomic, memoryOrder);
+				SIMD::Store(p, src.Float(i), robustness, state->activeLaneMask(), atomic, memoryOrder);
 			});
 		}
 
@@ -3891,6 +3954,11 @@
 			auto ptrTy = getType(getObject(ptrId).type);
 			auto ptr = GetPointerToData(ptrId, 0, state);
 			bool interleavedByLane = IsStorageInterleavedByLane(ptrTy.storageClass);
+			// TODO: GLSL modf() takes an output parameter and thus the pointer is assumed
+			// to be in bounds even for inactive lanes.
+			// - Clarify the SPIR-V spec.
+			// - Eliminate lane masking and assume interleaving.
+			auto robustness = OutOfBoundsBehavior::UndefinedBehavior;
 
 			for (auto i = 0u; i < type.sizeInComponents; i++)
 			{
@@ -3899,7 +3967,7 @@
 				dst.move(i, frac);
 				auto p = ptr + (i * sizeof(float));
 				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, whole, state->robust, state->activeLaneMask());
+				SIMD::Store(p, whole, robustness, state->activeLaneMask());
 			}
 			break;
 		}
@@ -4024,6 +4092,11 @@
 			auto ptrTy = getType(getObject(ptrId).type);
 			auto ptr = GetPointerToData(ptrId, 0, state);
 			bool interleavedByLane = IsStorageInterleavedByLane(ptrTy.storageClass);
+			// TODO: GLSL frexp() takes an output parameter and thus the pointer is assumed
+			// to be in bounds even for inactive lanes.
+			// - Clarify the SPIR-V spec.
+			// - Eliminate lane masking and assume interleaving.
+			auto robustness = OutOfBoundsBehavior::UndefinedBehavior;
 
 			for (auto i = 0u; i < type.sizeInComponents; i++)
 			{
@@ -4035,7 +4108,7 @@
 
 				auto p = ptr + (i * sizeof(float));
 				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, exponent, state->robust, state->activeLaneMask());
+				SIMD::Store(p, exponent, robustness, state->activeLaneMask());
 			}
 			break;
 		}
@@ -5245,13 +5318,18 @@
 		auto basePtr = SIMD::Pointer(imageBase, imageSizeInBytes);
 		auto texelPtr = GetTexelAddress(state, basePtr, coordinate, imageType, binding, texelSize, sampleId, useStencilAspect);
 
+		// "The value returned by a read of an invalid texel is undefined,
+		//  unless that read operation is from a buffer resource and the robustBufferAccess feature is enabled."
+		// TODO: Don't always assume a buffer resource.
+		auto robustness = OutOfBoundsBehavior::RobustBufferAccess;
+
 		SIMD::Int packed[4];
 		// Round up texel size: for formats smaller than 32 bits per texel, we will emit a bunch
 		// of (overlapping) 32b loads here, and each lane will pick out what it needs from the low bits.
 		// TODO: specialize for small formats?
 		for (auto i = 0; i < (texelSize + 3)/4; i++)
 		{
-			packed[i] = SIMD::Load<SIMD::Int>(texelPtr, state->robust, state->activeLaneMask(), false, std::memory_order_relaxed, std::min(texelSize, 4));
+			packed[i] = SIMD::Load<SIMD::Int>(texelPtr, robustness, state->activeLaneMask(), false, std::memory_order_relaxed, std::min(texelSize, 4));
 			texelPtr += sizeof(float);
 		}
 
@@ -5587,9 +5665,12 @@
 		auto basePtr = SIMD::Pointer(imageBase, imageSizeInBytes);
 		auto texelPtr = GetTexelAddress(state, basePtr, coordinate, imageType, binding, texelSize, 0, false);
 
+		// SPIR-V 1.4: "If the coordinates are outside the image, the memory location that is accessed is undefined."
+		auto robustness = OutOfBoundsBehavior::UndefinedValue;
+
 		for (auto i = 0u; i < numPackedElements; i++)
 		{
-			SIMD::Store(texelPtr, packed[i], state->robust, state->activeLaneMask());
+			SIMD::Store(texelPtr, packed[i], robustness, state->activeLaneMask());
 			texelPtr += sizeof(float);
 		}
 
@@ -5778,8 +5859,11 @@
 			if (dstInterleavedByLane) { dst = interleaveByLane(dst); }
 			if (srcInterleavedByLane) { src = interleaveByLane(src); }
 
-			auto value = SIMD::Load<SIMD::Float>(src, state->robust, state->activeLaneMask());
-			SIMD::Store(dst, value, state->robust, state->activeLaneMask());
+			// TODO(b/131224163): Optimize based on src/dst storage classes.
+			auto robustness = OutOfBoundsBehavior::RobustBufferAccess;
+
+			auto value = SIMD::Load<SIMD::Float>(src, robustness, state->activeLaneMask());
+			SIMD::Store(dst, value, robustness, state->activeLaneMask());
 		});
 		return EmitResult::Continue;
 	}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index a0c07be..7c850ba 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -55,6 +55,14 @@
 	// Forward declarations.
 	class SpirvRoutine;
 
+	enum class OutOfBoundsBehavior
+	{
+		Nullify,             // Loads become zero, stores are elided.
+		RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
+		UndefinedValue,      // Only for load operations. Not secure. No program termination.
+		UndefinedBehavior,   // Program may terminate.
+	};
+
 	// SIMD contains types that represent multiple scalars packed into a single
 	// vector data type. Types in the SIMD namespace provide a semantic hint
 	// that the data should be treated as a per-execution-lane scalar instead of
@@ -137,11 +145,11 @@
 				return dynamicOffsets + SIMD::Int(staticOffsets[0], staticOffsets[1], staticOffsets[2], staticOffsets[3]);
 			}
 
-			inline SIMD::Int isInBounds(unsigned int accessSize) const
+			inline SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
 			{
 				ASSERT(accessSize > 0);
 
-				if (isStaticAllInBounds(accessSize))
+				if (isStaticallyInBounds(accessSize, robustness))
 				{
 					return SIMD::Int(0xffffffff);
 				}
@@ -160,12 +168,31 @@
 				return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
 			}
 
-			inline bool isStaticAllInBounds(unsigned int accessSize) const
+			inline bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
 			{
-				if (hasDynamicOffsets || hasDynamicLimit)
+				if (hasDynamicOffsets)
 				{
 					return false;
 				}
+
+				if (hasDynamicLimit)
+				{
+					if (hasStaticEqualOffsets() || hasStaticSequentialOffsets(accessSize))
+					{
+						switch(robustness)
+						{
+						case OutOfBoundsBehavior::UndefinedBehavior:
+							// With this robustness setting the application/compiler guarantees in-bounds accesses on active lanes,
+							// but since it can't know in advance which branches are taken this must be true even for inactive lanes.
+							return true;
+						case OutOfBoundsBehavior::Nullify:
+						case OutOfBoundsBehavior::RobustBufferAccess:
+						case OutOfBoundsBehavior::UndefinedValue:
+							return false;
+						}
+					}
+				}
+
 				for (int i = 0; i < SIMD::Width; i++)
 				{
 					if (staticOffsets[i] + accessSize - 1 >= staticLimit)
@@ -173,6 +200,7 @@
 						return false;
 					}
 				}
+
 				return true;
 			}
 
@@ -247,8 +275,8 @@
 			SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
 			std::array<int32_t, SIMD::Width> staticOffsets;
 
-			bool hasDynamicLimit; // True if dynamicLimit is zero.
-			bool hasDynamicOffsets; // True if all dynamicOffsets are zero.
+			bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
+			bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
 		};
 
 		template <typename T> struct Element {};
@@ -257,16 +285,16 @@
 		template <> struct Element<UInt>  { using type = rr::UInt; };
 
 		template<typename T>
-		void Store(Pointer ptr, T val, bool robust, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+		void Store(Pointer ptr, T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
 
 		template<typename T>
-		void Store(Pointer ptr, RValue<T> val, bool robust, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed)
+		void Store(Pointer ptr, RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed)
 		{
-			Store(ptr, T(val), robust, mask, atomic, order);
+			Store(ptr, T(val), robustness, mask, atomic, order);
 		}
 
 		template<typename T>
-		T Load(Pointer ptr, bool robust, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
+		T Load(Pointer ptr, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
 	}
 
 	// Incrementally constructed complex bundle of rvalues
@@ -850,6 +878,7 @@
 		Function::ID entryPoint;
 
 		const bool robustBufferAccess = true;
+		spv::ExecutionModel executionModel = spv::ExecutionModelMax; // Invalid prior to OpEntryPoint parsing.
 
 		// DeclareType creates a Type for the given OpTypeX instruction, storing
 		// it into the types map. It is called from the analysis pass (constructor).
@@ -934,13 +963,16 @@
 					Function::ID function,
 					RValue<SIMD::Int> activeLaneMask,
 					const vk::DescriptorSet::Bindings &descriptorSets,
-					bool robustBufferAccess)
+					bool robustBufferAccess,
+					spv::ExecutionModel executionModel)
 				: routine(routine),
 				  function(function),
 				  activeLaneMaskValue(activeLaneMask.value),
 				  descriptorSets(descriptorSets),
-				  robust(robustBufferAccess)
+				  robustBufferAccess(robustBufferAccess),
+				  executionModel(executionModel)
 			{
+				ASSERT(executionModelToStage(executionModel) != VkShaderStageFlagBits(0));  // Must parse OpEntryPoint before emitting.
 			}
 
 			RValue<SIMD::Int> activeLaneMask() const
@@ -975,7 +1007,7 @@
 
 			const vk::DescriptorSet::Bindings &descriptorSets;
 
-			const bool robust = true;  // Emit robustBufferAccess safe code.
+			OutOfBoundsBehavior getOutOfBoundsBehavior(spv::StorageClass storageClass) const;
 
 			Intermediate& createIntermediate(Object::ID id, uint32_t size)
 			{
@@ -1005,9 +1037,13 @@
 				ASSERT_MSG(it != pointers.end(), "Unknown pointer %d", id.value());
 				return it->second;
 			}
+
 		private:
 			std::unordered_map<Object::ID, Intermediate> intermediates;
 			std::unordered_map<Object::ID, SIMD::Pointer> pointers;
+
+			const bool robustBufferAccess = true;  // Emit robustBufferAccess safe code.
+			const spv::ExecutionModel executionModel = spv::ExecutionModelMax;
 		};
 
 		// EmitResult is an enumerator of result values from the Emit functions.
@@ -1196,13 +1232,15 @@
 		std::pair<SIMD::Float, SIMD::Int> Frexp(RValue<SIMD::Float> val) const;
 
 		static ImageSampler *getImageSampler(uint32_t instruction, vk::SampledImageDescriptor const *imageDescriptor, const vk::Sampler *sampler);
-		static ImageSampler *emitSamplerFunction(ImageInstruction instruction, const Sampler &samplerState);
+		static std::shared_ptr<rr::Routine> emitSamplerRoutine(ImageInstruction instruction, const Sampler &samplerState);
 
 		// TODO(b/129523279): Eliminate conversion and use vk::Sampler members directly.
 		static sw::TextureType convertTextureType(VkImageViewType imageViewType);
 		static sw::FilterType convertFilterMode(const vk::Sampler *sampler);
 		static sw::MipmapType convertMipmapMode(const vk::Sampler *sampler);
 		static sw::AddressingMode convertAddressingMode(int coordinateIndex, VkSamplerAddressMode addressMode, VkImageViewType imageViewType);
+
+		// Returns 0 when invalid.
 		static VkShaderStageFlagBits executionModelToStage(spv::ExecutionModel model);
 	};
 
diff --git a/src/Pipeline/SpirvShaderSampling.cpp b/src/Pipeline/SpirvShaderSampling.cpp
index e02c32a..5e56977 100644
--- a/src/Pipeline/SpirvShaderSampling.cpp
+++ b/src/Pipeline/SpirvShaderSampling.cpp
@@ -16,13 +16,11 @@
 
 #include "SamplerCore.hpp" // TODO: Figure out what's needed.
 #include "System/Math.hpp"
-#include "Vulkan/VkBuffer.hpp"
 #include "Vulkan/VkDebug.hpp"
-#include "Vulkan/VkDescriptorSet.hpp"
-#include "Vulkan/VkPipelineLayout.hpp"
+#include "Vulkan/VkDescriptorSetLayout.hpp"
+#include "Vulkan/VkDevice.hpp"
 #include "Vulkan/VkImageView.hpp"
 #include "Vulkan/VkSampler.hpp"
-#include "Vulkan/VkDescriptorSetLayout.hpp"
 #include "Device/Config.hpp"
 
 #include <spirv/unified1/spirv.hpp>
@@ -31,31 +29,6 @@
 #include <climits>
 #include <mutex>
 
-namespace
-{
-
-struct SamplingRoutineKey
-{
-	uint32_t instruction;
-	uint32_t sampler;
-	uint32_t imageView;
-
-	bool operator==(const SamplingRoutineKey &rhs) const
-	{
-		return instruction == rhs.instruction && sampler == rhs.sampler && imageView == rhs.imageView;
-	}
-
-	struct Hash
-	{
-		std::size_t operator()(const SamplingRoutineKey &key) const noexcept
-		{
-			return (key.instruction << 16) ^ (key.sampler << 8) ^ key.imageView;
-		}
-	};
-};
-
-}
-
 namespace sw {
 
 SpirvShader::ImageSampler *SpirvShader::getImageSampler(uint32_t inst, vk::SampledImageDescriptor const *imageDescriptor, const vk::Sampler *sampler)
@@ -63,15 +36,24 @@
 	ImageInstruction instruction(inst);
 	ASSERT(imageDescriptor->imageViewId != 0 && (sampler->id != 0 || instruction.samplerMethod == Fetch));
 
-	// TODO(b/129523279): Move somewhere sensible.
-	static std::unordered_map<SamplingRoutineKey, ImageSampler*, SamplingRoutineKey::Hash> cache;
-	static std::mutex mutex;
+	vk::Device::SamplingRoutineCache::Key key = {inst, imageDescriptor->imageViewId, sampler->id};
 
-	SamplingRoutineKey key = {inst, imageDescriptor->imageViewId, sampler->id};
+	ASSERT(imageDescriptor->device);
 
-	std::unique_lock<std::mutex> lock(mutex);
-	auto it = cache.find(key);
-	if (it != cache.end()) { return it->second; }
+	auto routine = imageDescriptor->device->findInConstCache(key);
+	if(routine)
+	{
+		return (ImageSampler*)(routine->getEntry());
+	}
+
+	std::unique_lock<std::mutex> lock(imageDescriptor->device->getSamplingRoutineCacheMutex());
+	vk::Device::SamplingRoutineCache* cache = imageDescriptor->device->getSamplingRoutineCache();
+
+	routine = cache->query(key);
+	if(routine)
+	{
+		return (ImageSampler*)(routine->getEntry());
+	}
 
 	auto type = imageDescriptor->type;
 
@@ -108,13 +90,13 @@
 		UNSUPPORTED("anisotropyEnable");
 	}
 
-	auto fptr = emitSamplerFunction(instruction, samplerState);
+	routine = emitSamplerRoutine(instruction, samplerState);
 
-	cache.emplace(key, fptr);
-	return fptr;
+	cache->add(key, routine);
+	return (ImageSampler*)(routine->getEntry());
 }
 
-SpirvShader::ImageSampler *SpirvShader::emitSamplerFunction(ImageInstruction instruction, const Sampler &samplerState)
+std::shared_ptr<rr::Routine> SpirvShader::emitSamplerRoutine(ImageInstruction instruction, const Sampler &samplerState)
 {
 	// TODO(b/129523279): Hold a separate mutex lock for the sampler being built.
 	rr::Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<SIMD::Float>, Pointer<SIMD::Float>, Pointer<Byte>)> function;
@@ -231,7 +213,7 @@
 		}
 	}
 
-	return (ImageSampler*)function(vk::ReactorOptimizationLevel, "sampler")->getEntry();
+	return function("sampler");
 }
 
 sw::TextureType SpirvShader::convertTextureType(VkImageViewType imageViewType)
diff --git a/src/Reactor/BUILD.gn b/src/Reactor/BUILD.gn
index 94a66da..ccde7db 100644
--- a/src/Reactor/BUILD.gn
+++ b/src/Reactor/BUILD.gn
@@ -46,7 +46,6 @@
     "Debug.cpp",
     "ExecutableMemory.cpp",
     "Reactor.cpp",
-    "Routine.cpp",
   ]
 }
 
diff --git a/src/Reactor/Coroutine.hpp b/src/Reactor/Coroutine.hpp
index 993e7e0..aaf08ae 100644
--- a/src/Reactor/Coroutine.hpp
+++ b/src/Reactor/Coroutine.hpp
@@ -133,7 +133,7 @@
 		// called without building a new rr::Function or rr::Coroutine.
 		// While automatically called by operator(), finalize() should be called
 		// as early as possible to release the global Reactor mutex lock.
-		inline void finalize(OptimizationLevel optLevel = OptimizationLevel::Default);
+		inline void finalize(const Config::Edit &cfg = Config::Edit::None);
 
 		// Starts execution of the coroutine and returns a unique_ptr to a
 		// Stream<> that exposes the await() function for obtaining yielded
@@ -164,11 +164,11 @@
 	}
 
 	template<typename Return, typename... Arguments>
-	void Coroutine<Return(Arguments...)>::finalize(OptimizationLevel optLevel /* = OptimizationLevel::Default */)
+	void Coroutine<Return(Arguments...)>::finalize(const Config::Edit &cfg /* = Config::Edit::None */)
 	{
 		if(core != nullptr)
 		{
-			routine.reset(core->acquireCoroutine("coroutine", optLevel));
+			routine = core->acquireCoroutine("coroutine", cfg);
 			core.reset(nullptr);
 		}
 	}
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index c2c7ae7..106ac35 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -105,46 +105,125 @@
 
 namespace
 {
-	class LLVMInitializer
+	// Default configuration settings. Must be accessed under mutex lock.
+	std::mutex defaultConfigLock;
+	rr::Config &defaultConfig()
 	{
-	protected:
-		LLVMInitializer()
-		{
-			llvm::InitializeNativeTarget();
-			llvm::InitializeNativeTargetAsmPrinter();
-			llvm::InitializeNativeTargetAsmParser();
-		}
+		// This uses a static in a function to avoid the cost of a global static
+		// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
+		static rr::Config config = rr::Config::Edit()
+			.set(rr::Optimization::Level::Default)
+			.add(rr::Optimization::Pass::ScalarReplAggregates)
+			.add(rr::Optimization::Pass::InstructionCombining)
+			.apply({});
+		return config;
+	}
+
+	// Cache provides a simple, thread-safe key-value store.
+	template <typename KEY, typename VALUE>
+	class Cache
+	{
+	public:
+		Cache() = default;
+		Cache(const Cache& other);
+		VALUE getOrCreate(KEY key, std::function<VALUE()> create);
+	private:
+		mutable std::mutex mutex; // mutable required for copy constructor.
+		std::unordered_map<KEY, VALUE> map;
 	};
 
+	template <typename KEY, typename VALUE>
+	Cache<KEY, VALUE>::Cache(const Cache& other)
+	{
+		std::unique_lock<std::mutex> lock(other.mutex);
+		map = other.map;
+	}
+
+	template <typename KEY, typename VALUE>
+	VALUE Cache<KEY, VALUE>::getOrCreate(KEY key, std::function<VALUE()> create)
+	{
+		std::unique_lock<std::mutex> lock(mutex);
+		auto it = map.find(key);
+		if (it != map.end())
+		{
+			return it->second;
+		}
+		auto value = create();
+		map.emplace(key, value);
+		return value;
+	}
+
 	// JITGlobals is a singleton that holds all the immutable machine specific
 	// information for the host device.
-	class JITGlobals : LLVMInitializer
+	class JITGlobals
 	{
 	public:
-		static JITGlobals const * get();
+		using TargetMachineSPtr = std::shared_ptr<llvm::TargetMachine>;
 
-		std::string mcpu;
-		std::vector<std::string> mattrs;
-		const char* march;
-		llvm::TargetOptions targetOptions;
-		llvm::DataLayout dataLayout = llvm::DataLayout("");
+		static JITGlobals * get();
+
+		const std::string mcpu;
+		const std::vector<std::string> mattrs;
+		const char* const march;
+		const llvm::TargetOptions targetOptions;
+		const llvm::DataLayout dataLayout;
+
+		TargetMachineSPtr getTargetMachine(rr::Optimization::Level optlevel);
 
 	private:
-		JITGlobals();
+		static JITGlobals create();
+		static llvm::CodeGenOpt::Level toLLVM(rr::Optimization::Level level);
+		JITGlobals(const char *mcpu,
+		           const std::vector<std::string> &mattrs,
+		           const char *march,
+		           const llvm::TargetOptions &targetOptions,
+		           const llvm::DataLayout &dataLayout);
+		JITGlobals(const JITGlobals&) = default;
+
+		// The cache key here is actually a rr::Optimization::Level. We use int
+		// as 'enum class' types do not provide builtin hash functions until
+		// C++14. See: https://stackoverflow.com/a/29618545.
+		Cache<int, TargetMachineSPtr> targetMachines;
 	};
 
-	JITGlobals const * JITGlobals::get()
+	JITGlobals * JITGlobals::get()
 	{
-		static JITGlobals instance;
+		static JITGlobals instance = create();
 		return &instance;
 	}
 
-	JITGlobals::JITGlobals()
+	JITGlobals::TargetMachineSPtr JITGlobals::getTargetMachine(rr::Optimization::Level optlevel)
 	{
-		// mcpu
-		mcpu = llvm::sys::getHostCPUName();
+		return targetMachines.getOrCreate(static_cast<int>(optlevel), [&]() {
+			return TargetMachineSPtr(llvm::EngineBuilder()
+#ifdef ENABLE_RR_DEBUG_INFO
+				.setOptLevel(toLLVM(rr::Optimization::Level::None))
+#else
+				.setOptLevel(toLLVM(optlevel))
+#endif // ENABLE_RR_DEBUG_INFO
+				.setMCPU(mcpu)
+				.setMArch(march)
+				.setMAttrs(mattrs)
+				.setTargetOptions(targetOptions)
+				.selectTarget());
+		});
+	}
 
-		// mattrs
+	JITGlobals JITGlobals::create()
+	{
+		struct LLVMInitializer
+		{
+			LLVMInitializer()
+			{
+				llvm::InitializeNativeTarget();
+				llvm::InitializeNativeTargetAsmPrinter();
+				llvm::InitializeNativeTargetAsmParser();
+			}
+		};
+		static LLVMInitializer initializeLLVM;
+
+		auto mcpu = llvm::sys::getHostCPUName();
+
 		llvm::StringMap<bool> features;
 		bool ok = llvm::sys::getHostCPUFeatures(features);
 
@@ -155,31 +234,13 @@
 		(void) ok; // getHostCPUFeatures always returns false on other platforms
 #endif
 
+		std::vector<std::string> mattrs;
 		for (auto &feature : features)
 		{
 			if (feature.second) { mattrs.push_back(feature.first()); }
 		}
 
-#if 0
-#if defined(__i386__) || defined(__x86_64__)
-		mattrs.push_back(CPUID::supportsMMX()    ? "+mmx"    : "-mmx");
-		mattrs.push_back(CPUID::supportsCMOV()   ? "+cmov"   : "-cmov");
-		mattrs.push_back(CPUID::supportsSSE()    ? "+sse"    : "-sse");
-		mattrs.push_back(CPUID::supportsSSE2()   ? "+sse2"   : "-sse2");
-		mattrs.push_back(CPUID::supportsSSE3()   ? "+sse3"   : "-sse3");
-		mattrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3"  : "-ssse3");
-		mattrs.push_back(CPUID::supportsSSE4_1() ? "+sse4.1" : "-sse4.1");
-#elif defined(__arm__)
-#if __ARM_ARCH >= 8
-		mattrs.push_back("+armv8-a");
-#else
-		// armv7-a requires compiler-rt routines; otherwise, compiled kernel
-		// might fail to link.
-#endif
-#endif
-#endif
-
-		// arch
+		const char* march = nullptr;
 #if defined(__x86_64__)
 		march = "x86-64";
 #elif defined(__i386__)
@@ -200,9 +261,8 @@
 		#error "unknown architecture"
 #endif
 
+		llvm::TargetOptions targetOptions;
 		targetOptions.UnsafeFPMath = false;
-		// targetOpts.NoInfsFPMath = true;
-		// targetOpts.NoNaNsFPMath = true;
 
 		auto targetMachine = std::unique_ptr<llvm::TargetMachine>(
 			llvm::EngineBuilder()
@@ -213,7 +273,35 @@
 				.setTargetOptions(targetOptions)
 				.selectTarget());
 
-		dataLayout = targetMachine->createDataLayout();
+		auto dataLayout = targetMachine->createDataLayout();
+
+		return JITGlobals(mcpu.data(), mattrs, march, targetOptions, dataLayout);
+	}
+
+	llvm::CodeGenOpt::Level JITGlobals::toLLVM(rr::Optimization::Level level)
+	{
+		switch (level)
+		{
+			case rr::Optimization::Level::None:       return ::llvm::CodeGenOpt::None;
+			case rr::Optimization::Level::Less:       return ::llvm::CodeGenOpt::Less;
+			case rr::Optimization::Level::Default:    return ::llvm::CodeGenOpt::Default;
+			case rr::Optimization::Level::Aggressive: return ::llvm::CodeGenOpt::Aggressive;
+			default: UNREACHABLE("Unknown Optimization Level %d", int(level));
+		}
+		return ::llvm::CodeGenOpt::Default;
+	}
+
+	JITGlobals::JITGlobals(const char* mcpu,
+	                       const std::vector<std::string> &mattrs,
+	                       const char* march,
+	                       const llvm::TargetOptions &targetOptions,
+	                       const llvm::DataLayout &dataLayout) :
+			mcpu(mcpu),
+			mattrs(mattrs),
+			march(march),
+			targetOptions(targetOptions),
+			dataLayout(dataLayout)
+	{
 	}
 
 	// JITRoutine is a rr::Routine that holds a LLVM JIT session, compiler and
@@ -228,7 +316,7 @@
 				std::unique_ptr<llvm::Module> module,
 				llvm::Function **funcs,
 				size_t count,
-				rr::OptimizationLevel optLevel) :
+				const rr::Config &config) :
 			resolver(createLegacyLookupResolver(
 				session,
 				[&](const std::string &name) {
@@ -247,17 +335,7 @@
 						return;
 					}
 				})),
-			targetMachine(llvm::EngineBuilder()
-#ifdef ENABLE_RR_DEBUG_INFO
-				.setOptLevel(llvm::CodeGenOpt::None)
-#else
-				.setOptLevel(toLLVM(optLevel))
-#endif // ENABLE_RR_DEBUG_INFO
-				.setMCPU(JITGlobals::get()->mcpu)
-				.setMArch(JITGlobals::get()->march)
-				.setMAttrs(JITGlobals::get()->mattrs)
-				.setTargetOptions(JITGlobals::get()->targetOptions)
-				.selectTarget()),
+			targetMachine(JITGlobals::get()->getTargetMachine(config.getOptimization().getLevel())),
 			compileLayer(objLayer, llvm::orc::SimpleCompiler(*targetMachine)),
 			objLayer(
 				session,
@@ -318,21 +396,8 @@
 		}
 
 	private:
-		static ::llvm::CodeGenOpt::Level toLLVM(rr::OptimizationLevel level)
-		{
-			switch (level)
-			{
-				case rr::OptimizationLevel::None:       return ::llvm::CodeGenOpt::None;
-				case rr::OptimizationLevel::Less:       return ::llvm::CodeGenOpt::Less;
-				case rr::OptimizationLevel::Default:    return ::llvm::CodeGenOpt::Default;
-				case rr::OptimizationLevel::Aggressive: return ::llvm::CodeGenOpt::Aggressive;
-				default: UNREACHABLE("Unknown OptimizationLevel %d", int(level));
-			}
-			return ::llvm::CodeGenOpt::Default;
-		}
-
 		std::shared_ptr<llvm::orc::SymbolResolver> resolver;
-		std::unique_ptr<llvm::TargetMachine> targetMachine;
+		std::shared_ptr<llvm::TargetMachine> targetMachine;
 		llvm::orc::ExecutionSession session;
 		CompileLayer compileLayer;
 		ObjLayer objLayer;
@@ -343,15 +408,17 @@
 	class JITBuilder
 	{
 	public:
-		JITBuilder():
+		JITBuilder(const rr::Config &config) :
+			config(config),
 			module(new llvm::Module("", context)),
 			builder(new llvm::IRBuilder<>(context))
 		{
 			module->setDataLayout(JITGlobals::get()->dataLayout);
 		}
 
-		void optimize()
+		void optimize(const rr::Config &cfg)
 		{
+
 #ifdef ENABLE_RR_DEBUG_INFO
 			if (debugInfo != nullptr)
 			{
@@ -362,36 +429,36 @@
 			std::unique_ptr<llvm::legacy::PassManager> passManager(
 				new llvm::legacy::PassManager());
 
-			passManager->add(llvm::createSROAPass());
-
-			for(int pass = 0; pass < 10 && rr::optimization[pass] != rr::Disabled; pass++)
+			for(auto pass : cfg.getOptimization().getPasses())
 			{
-				switch(rr::optimization[pass])
+				switch(pass)
 				{
-				case rr::Disabled:                                                                       break;
-				case rr::CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
-				case rr::LICM:                 passManager->add(llvm::createLICMPass());                 break;
-				case rr::AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
-				case rr::GVN:                  passManager->add(llvm::createGVNPass());                  break;
-				case rr::InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
-				case rr::Reassociate:          passManager->add(llvm::createReassociatePass());          break;
-				case rr::DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
-				case rr::SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
-				case rr::ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
+				case rr::Optimization::Pass::Disabled:                                                                       break;
+				case rr::Optimization::Pass::CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
+				case rr::Optimization::Pass::LICM:                 passManager->add(llvm::createLICMPass());                 break;
+				case rr::Optimization::Pass::AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
+				case rr::Optimization::Pass::GVN:                  passManager->add(llvm::createGVNPass());                  break;
+				case rr::Optimization::Pass::InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
+				case rr::Optimization::Pass::Reassociate:          passManager->add(llvm::createReassociatePass());          break;
+				case rr::Optimization::Pass::DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
+				case rr::Optimization::Pass::SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
+				case rr::Optimization::Pass::ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
+				case rr::Optimization::Pass::EarlyCSEPass:         passManager->add(llvm::createEarlyCSEPass());             break;
 				default:
-					UNREACHABLE("optimization[pass]: %d, pass: %d", int(rr::optimization[pass]), int(pass));
+					UNREACHABLE("pass: %d", int(pass));
 				}
 			}
 
 			passManager->run(*module);
 		}
 
-		rr::Routine *acquireRoutine(llvm::Function **funcs, size_t count, rr::OptimizationLevel optLevel)
+		std::shared_ptr<rr::Routine> acquireRoutine(llvm::Function **funcs, size_t count, const rr::Config &cfg)
 		{
 			ASSERT(module);
-			return new JITRoutine(std::move(module), funcs, count, optLevel);
+			return std::make_shared<JITRoutine>(std::move(module), funcs, count, cfg);
 		}
 
+		const rr::Config config;
 		llvm::LLVMContext context;
 		std::unique_ptr<llvm::Module> module;
 		std::unique_ptr<llvm::IRBuilder<>> builder;
@@ -404,6 +471,8 @@
 			llvm::Value *handle = nullptr;
 			llvm::Value *id = nullptr;
 			llvm::Value *promise = nullptr;
+			llvm::Type *yieldType = nullptr;
+			llvm::BasicBlock *entryBlock = nullptr;
 			llvm::BasicBlock *suspendBlock = nullptr;
 			llvm::BasicBlock *endBlock = nullptr;
 			llvm::BasicBlock *destroyBlock = nullptr;
@@ -1002,8 +1071,6 @@
 		return it->second;
 	}
 
-	Optimization optimization[10] = {InstructionCombining, Disabled};
-
 	// The abstract Type* types are implemented as LLVM types, except that
 	// 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
 	// and VFP in ARM, and eliminate the overhead of converting them to explicit
@@ -1134,7 +1201,7 @@
 		::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
 
 		ASSERT(jit == nullptr);
-		jit.reset(new JITBuilder());
+		jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));
 	}
 
 	Nucleus::~Nucleus()
@@ -1143,8 +1210,29 @@
 		::codegenMutex.unlock();
 	}
 
-	Routine *Nucleus::acquireRoutine(const char *name, OptimizationLevel optimizationLevel)
+	void Nucleus::setDefaultConfig(const Config &cfg)
 	{
+		std::unique_lock<std::mutex> lock(::defaultConfigLock);
+		::defaultConfig() = cfg;
+	}
+
+	void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
+	{
+		std::unique_lock<std::mutex> lock(::defaultConfigLock);
+		auto &config = ::defaultConfig();
+		config = cfgEdit.apply(config);
+	}
+
+	Config Nucleus::getDefaultConfig()
+	{
+		std::unique_lock<std::mutex> lock(::defaultConfigLock);
+		return ::defaultConfig();
+	}
+
+	std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
+	{
+		auto cfg = cfgEdit.apply(jit->config);
+
 		if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
 		{
 			llvm::Type *type = jit->function->getReturnType();
@@ -1181,7 +1269,7 @@
 		}
 #endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
 
-		optimize();
+		jit->optimize(cfg);
 
 		if(false)
 		{
@@ -1190,17 +1278,12 @@
 			jit->module->print(file, 0);
 		}
 
-		auto routine = jit->acquireRoutine(&jit->function, 1, optimizationLevel);
+		auto routine = jit->acquireRoutine(&jit->function, 1, cfg);
 		jit.reset();
 
 		return routine;
 	}
 
-	void Nucleus::optimize()
-	{
-		jit->optimize();
-	}
-
 	Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
 	{
 		// Need to allocate it in the entry block for mem2reg to work
@@ -1246,7 +1329,7 @@
 		jit->function = rr::createFunction("", T(ReturnType), T(Params));
 
 #ifdef ENABLE_RR_DEBUG_INFO
-		jit->debugInfo = std::unique_ptr<DebugInfo>(new DebugInfo(jit->builder, jit->context, jit->module.get(), jit->function));
+		jit->debugInfo = std::unique_ptr<DebugInfo>(new DebugInfo(jit->builder.get(), &jit->context, jit->module.get(), jit->function));
 #endif // ENABLE_RR_DEBUG_INFO
 
 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->function));
@@ -4462,22 +4545,19 @@
 		SuspendActionDestroy = 1
 	};
 
-} // anonymous namespace
 
-namespace rr {
-
-void Nucleus::createCoroutine(Type *YieldType, std::vector<Type*> &Params)
+void promoteFunctionToCoroutine()
 {
+	ASSERT(jit->coroutine.id == nullptr);
+
 	// Types
 	auto voidTy = ::llvm::Type::getVoidTy(jit->context);
 	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
 	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
 	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
 	auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
-	auto promiseTy = T(YieldType);
+	auto promiseTy = jit->coroutine.yieldType;
 	auto promisePtrTy = promiseTy->getPointerTo();
-	auto handleTy = i8PtrTy;
-	auto boolTy = i1Ty;
 
 	// LLVM intrinsics
 	auto coro_id = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
@@ -4496,6 +4576,8 @@
 	auto freeFrameTy = ::llvm::FunctionType::get(voidTy, {i8PtrTy}, false);
 	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
 
+	auto oldInsertionPoint = jit->builder->saveIP();
+
 	// Build the coroutine_await() function:
 	//
 	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
@@ -4512,7 +4594,6 @@
 	//        }
 	//    }
 	//
-	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, {handleTy, promisePtrTy});
 	{
 		auto args = jit->coroutine.await->arg_begin();
 		auto handle = args++;
@@ -4543,7 +4624,6 @@
 	//        llvm.coro.destroy(handle);
 	//    }
 	//
-	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, {handleTy});
 	{
 		auto handle = jit->coroutine.destroy->arg_begin();
 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.destroy));
@@ -4583,20 +4663,17 @@
 	//        return handle;
 	//    }
 	//
-	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
 
 #ifdef ENABLE_RR_DEBUG_INFO
-	jit->debugInfo = std::unique_ptr<DebugInfo>(new DebugInfo(jit->builder, jit->context, jit->module, jit->function));
+	jit->debugInfo = std::unique_ptr<rr::DebugInfo>(new rr::DebugInfo(jit->builder.get(), &jit->context, jit->module.get(), jit->function));
 #endif // ENABLE_RR_DEBUG_INFO
 
-	auto entryBlock = llvm::BasicBlock::Create(jit->context, "coroutine", jit->function);
 	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(jit->context, "suspend", jit->function);
 	jit->coroutine.endBlock = llvm::BasicBlock::Create(jit->context, "end", jit->function);
 	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(jit->context, "destroy", jit->function);
 
-	jit->builder->SetInsertPoint(entryBlock);
-	Variable::materializeAll();
-	jit->coroutine.promise = jit->builder->CreateAlloca(T(YieldType), nullptr, "promise");
+	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
+	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
 	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
 		::llvm::ConstantInt::get(i32Ty, 0),
 		jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
@@ -4628,13 +4705,45 @@
 	jit->builder->CreateCall(freeFrame, {memory});
 	jit->builder->CreateBr(jit->coroutine.suspendBlock);
 
-	// Switch back to the entry block for reactor codegen.
-	jit->builder->SetInsertPoint(entryBlock);
+	// Switch back to original insert point to continue building the coroutine.
+	jit->builder->restoreIP(oldInsertionPoint);
+}
+
+} // anonymous namespace
+
+namespace rr {
+
+void Nucleus::createCoroutine(Type *YieldType, std::vector<Type*> &Params)
+{
+	// Coroutines are initially created as a regular function.
+	// Upon the first call to Yield(), the function is promoted to a true
+	// coroutine.
+	auto voidTy = ::llvm::Type::getVoidTy(jit->context);
+	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
+	auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
+	auto handleTy = i8PtrTy;  // The coroutine handle is represented as an i8*.
+	auto boolTy = i1Ty;  // Booleans are represented as i1.
+	auto promiseTy = T(YieldType);
+	auto promisePtrTy = promiseTy->getPointerTo();
+
+	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));  // Entry point; returns the handle.
+	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, {handleTy, promisePtrTy});  // Declared only; no body is emitted here.
+	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, {handleTy});  // Declared only; no body is emitted here.
+	jit->coroutine.yieldType = promiseTy;  // Kept on the JIT state; presumably read back on promotion - TODO confirm.
+	jit->coroutine.entryBlock = llvm::BasicBlock::Create(jit->context, "function", jit->function);
+
+	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);  // Subsequent codegen is appended to the entry block.
 }
 
 void Nucleus::yield(Value* val)
 {
-	ASSERT_MSG(jit->coroutine.id != nullptr, "yield() can only be called when building a Coroutine");
+	if (jit->coroutine.id == nullptr)
+	{
+		// First call to yield().
+		// Promote the function to a full coroutine.
+		promoteFunctionToCoroutine();
+		ASSERT(jit->coroutine.id != nullptr);
+	}
 
 	//      promise = val;
 	//
@@ -4678,11 +4787,26 @@
 	jit->builder->SetInsertPoint(resumeBlock);
 }
 
-Routine* Nucleus::acquireCoroutine(const char *name, OptimizationLevel optimizationLevel)
+std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
 {
-	ASSERT_MSG(jit->coroutine.id != nullptr, "acquireCoroutine() called without a call to createCoroutine()");
-
-	jit->builder->CreateBr(jit->coroutine.endBlock);
+	bool isCoroutine = jit->coroutine.id != nullptr;
+	if (isCoroutine)
+	{
+		jit->builder->CreateBr(jit->coroutine.endBlock);
+	}
+	else
+	{
+		// Coroutine without a Yield acts as a regular function.
+		// The 'coroutine_begin' function returns a nullptr for the coroutine
+		// handle.
+		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
+		// The 'coroutine_await' function always returns false (coroutine done).
+		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.await));
+		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
+		// The 'coroutine_destroy' does nothing, returns void.
+		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.destroy));
+		jit->builder->CreateRetVoid();
+	}
 
 #ifdef ENABLE_RR_DEBUG_INFO
 	if (jit->debugInfo != nullptr)
@@ -4698,16 +4822,28 @@
 		jit->module->print(file, 0);
 	}
 
-	// Run manadory coroutine transforms.
-	llvm::legacy::PassManager pm;
-	pm.add(llvm::createCoroEarlyPass());
-	pm.add(llvm::createCoroSplitPass());
-	pm.add(llvm::createCoroElidePass());
-	pm.add(llvm::createBarrierNoopPass());
-	pm.add(llvm::createCoroCleanupPass());
-	pm.run(*jit->module);
+	if (isCoroutine)
+	{
+		// Run mandatory coroutine transforms.
+		llvm::legacy::PassManager pm;
+		pm.add(llvm::createCoroEarlyPass());
+		pm.add(llvm::createCoroSplitPass());
+		pm.add(llvm::createCoroElidePass());
+		pm.add(llvm::createBarrierNoopPass());
+		pm.add(llvm::createCoroCleanupPass());
+		pm.run(*jit->module);
+	}
 
-	optimize();
+#if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
+	{
+		llvm::legacy::PassManager pm;
+		pm.add(llvm::createVerifierPass());
+		pm.run(*jit->module);
+	}
+#endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
+
+	auto cfg = cfgEdit.apply(jit->config);
+	jit->optimize(cfg);
 
 	if(false)
 	{
@@ -4720,7 +4856,7 @@
 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
-	auto routine = jit->acquireRoutine(funcs, Nucleus::CoroutineEntryCount, optimizationLevel);
+	auto routine = jit->acquireRoutine(funcs, Nucleus::CoroutineEntryCount, cfg);
 	jit.reset();
 
 	return routine;
diff --git a/src/Reactor/Nucleus.hpp b/src/Reactor/Nucleus.hpp
index 819d100..cc20e27 100644
--- a/src/Reactor/Nucleus.hpp
+++ b/src/Reactor/Nucleus.hpp
@@ -15,11 +15,12 @@
 #ifndef rr_Nucleus_hpp
 #define rr_Nucleus_hpp
 
+#include <atomic>
 #include <cassert>
 #include <cstdarg>
 #include <cstdint>
+#include <memory>
 #include <vector>
-#include <atomic>
 
 #ifdef None
 #undef None  // b/127920555
@@ -33,30 +34,86 @@
 	class BasicBlock;
 	class Routine;
 
-	enum Optimization
+	// Optimization holds the optimization settings for code generation.
+	class Optimization
 	{
-		Disabled             = 0,
-		InstructionCombining = 1,
-		CFGSimplification    = 2,
-		LICM                 = 3,
-		AggressiveDCE        = 4,
-		GVN                  = 5,
-		Reassociate          = 6,
-		DeadStoreElimination = 7,
-		SCCP                 = 8,
-		ScalarReplAggregates = 9,
+	public:
+		enum class Level  // Overall optimization effort.
+		{
+			None,
+			Less,
+			Default,
+			Aggressive,
+		};
 
-		OptimizationCount
+		enum class Pass  // Individual optimization passes that can be listed in Passes.
+		{
+			Disabled,  // NOTE(review): appears to be a placeholder rather than a real pass - confirm.
+			InstructionCombining,
+			CFGSimplification,
+			LICM,
+			AggressiveDCE,
+			GVN,
+			Reassociate,
+			DeadStoreElimination,
+			SCCP,
+			ScalarReplAggregates,
+			EarlyCSEPass,
+
+			Count,  // Number of enumerators above; not a pass itself.
+		};
+
+		using Passes = std::vector<Pass>;
+
+		Optimization() = default;  // Level::Default with an empty pass list.
+		Optimization(Level level, const Passes & passes) : level(level), passes(passes) {}
+
+		Level getLevel() const { return level; }
+		const Passes & getPasses() const { return passes; }
+
+	private:
+		Level level = Level::Default;
+		Passes passes;
 	};
 
-	extern Optimization optimization[10];
-
-	enum class OptimizationLevel
+	// Config holds the Reactor configuration settings.
+	class Config
 	{
-		None,
-		Less,
-		Default,
-		Aggressive,
+	public:
+		// Edit holds a number of modifications to a config, that can be applied
+		// on an existing Config to produce a new Config with the specified
+		// changes.
+		class Edit
+		{
+		public:
+			static const Edit None;  // Shared no-op edit; apply() returns its input unchanged for it.
+
+			Edit & set(Optimization::Level level) { optLevel = level; optLevelChanged = true; return *this; }
+			Edit & add(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Add, pass}); return *this; }
+			Edit & remove(Optimization::Pass pass) { optPassEdits.push_back({ListEdit::Remove, pass}); return *this; }
+			Edit & clearOptimizationPasses() { optPassEdits.push_back({ListEdit::Clear, Optimization::Pass::Disabled}); return *this; }  // The Pass value is a placeholder; Clear ignores it.
+
+			Config apply(const Config &cfg) const;  // Returns the edited copy; cfg itself is not modified.
+
+		private:
+			enum class ListEdit { Add, Remove, Clear };
+			using OptPassesEdit = std::pair<ListEdit, Optimization::Pass>;
+
+			template <typename T>
+			void apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const;
+
+			Optimization::Level optLevel = Optimization::Level::Default;  // Initialized so copying an Edit never reads an indeterminate value; only used when optLevelChanged is true.
+			bool optLevelChanged = false;
+			std::vector<OptPassesEdit> optPassEdits;  // Recorded in call order and replayed in that order by apply().
+		};
+
+		Config() = default;
+		Config(const Optimization & optimization) : optimization(optimization) {}
+
+		const Optimization & getOptimization() const { return optimization; }
+
+	private:
+		Optimization optimization;
 	};
 
 	class Nucleus
@@ -66,7 +123,13 @@
 
 		virtual ~Nucleus();
 
-		Routine *acquireRoutine(const char *name, OptimizationLevel optimizationLevel);
+		// Default configuration to use when no other configuration is specified.
+		// The new configuration will be applied to subsequent reactor calls.
+		static void setDefaultConfig(const Config &cfg);
+		static void adjustDefaultConfig(const Config::Edit &cfgEdit);
+		static Config getDefaultConfig();
+
+		std::shared_ptr<Routine> acquireRoutine(const char *name, const Config::Edit &cfgEdit = Config::Edit::None);
 
 		static Value *allocateStackVariable(Type *type, int arraySize = 0);
 		static BasicBlock *createBasicBlock();
@@ -93,7 +156,7 @@
 		};
 
 		static void createCoroutine(Type *ReturnType, std::vector<Type*> &Params);
-		Routine *acquireCoroutine(const char *name, OptimizationLevel optimizationLevel);
+		std::shared_ptr<Routine> acquireCoroutine(const char *name, const Config::Edit &cfg = Config::Edit::None);
 		static void yield(Value*);
 
 		// Terminators
@@ -219,9 +282,6 @@
 		static Value *createConstantVector(const double *constants, Type *type);
 
 		static Type *getPointerType(Type *elementType);
-
-	private:
-		void optimize();
 	};
 }
 
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index bb94cf5..60ee656 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -21,8 +21,59 @@
 #define REACTOR_MATERIALIZE_LVALUES_ON_DEFINITION 0
 #endif
 
+namespace
+{
+	// Local remove_if helper. NOTE(review): this note previously read "Introduced in C++20"; std::remove_if itself predates C++20 - confirm what was meant.
+	template <class ForwardIterator, class UnaryPredicate>
+	ForwardIterator remove_if(ForwardIterator first, ForwardIterator last,
+								UnaryPredicate pred)
+	{
+		ForwardIterator result = first;  // Next slot that receives a kept element.
+		while (first!=last) {
+			if (!pred(*first)) {  // Keep only the elements the predicate rejects.
+				*result = std::move(*first);
+				++result;
+			}
+			++first;
+		}
+		return result;  // New logical end; the range is not shortened, so the caller must erase [result, last).
+	}
+}
+
 namespace rr
 {
+	const Config::Edit Config::Edit::None = {};
+
+	Config Config::Edit::apply(const Config &cfg) const  // Builds a new Config from cfg plus the edits recorded in this object.
+	{
+		if (this == &None) { return cfg; }  // Fast path: the shared None edit changes nothing.
+
+		auto level = optLevelChanged ? optLevel : cfg.optimization.getLevel();  // Override the level only if set() was called.
+		auto passes = cfg.optimization.getPasses();  // Copies the pass list, so cfg is left untouched.
+		apply(optPassEdits, passes);  // Replays the recorded add/remove/clear edits in order.
+		return Config{ Optimization{level, passes} };
+	}
+
+	template <typename T>  // Replays 'edits' on 'list' in order: Add appends, Remove drops every equal item, Clear empties the list.
+	void rr::Config::Edit::apply(const std::vector<std::pair<ListEdit, T>> & edits, std::vector<T>& list) const
+	{
+		for (auto & edit : edits)
+		{
+			switch (edit.first)
+			{
+			case ListEdit::Add:
+				list.push_back(edit.second);
+				break;
+			case ListEdit::Remove:
+				list.erase(::remove_if(list.begin(), list.end(), [&](T item) { return item == edit.second; }), list.end());  // remove_if only compacts the kept items; erase the leftover tail so the list actually shrinks.
+				break;
+			case ListEdit::Clear:
+				list.clear();
+				break;
+			}
+		}
+	}
+
 	// Set of variables that do not have a stack location yet.
 	std::unordered_set<Variable*> Variable::unmaterializedVariables;
 
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 1391275..5add9cd 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2464,8 +2464,8 @@
 			return Argument<typename std::tuple_element<index, std::tuple<Arguments...>>::type>(arg);
 		}
 
-		Routine *operator()(const char *name, ...);
-		Routine *operator()(OptimizationLevel optLevel, const char *name, ...);
+		std::shared_ptr<Routine> operator()(const char *name, ...);
+		std::shared_ptr<Routine> operator()(const Config::Edit &cfg, const char *name, ...);
 
 	protected:
 		Nucleus *core;
@@ -3031,7 +3031,7 @@
 	}
 
 	template<typename Return, typename... Arguments>
-	Routine *Function<Return(Arguments...)>::operator()(const char *name, ...)
+	std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const char *name, ...)
 	{
 		char fullName[1024 + 1];
 
@@ -3040,11 +3040,11 @@
 		vsnprintf(fullName, 1024, name, vararg);
 		va_end(vararg);
 
-		return core->acquireRoutine(fullName, OptimizationLevel::Default);
+		return core->acquireRoutine(fullName, Config::Edit::None);
 	}
 
 	template<typename Return, typename... Arguments>
-	Routine *Function<Return(Arguments...)>::operator()(OptimizationLevel optLevel, const char *name, ...)
+	std::shared_ptr<Routine> Function<Return(Arguments...)>::operator()(const Config::Edit &cfg, const char *name, ...)
 	{
 		char fullName[1024 + 1];
 
@@ -3053,7 +3053,7 @@
 		vsnprintf(fullName, 1024, name, vararg);
 		va_end(vararg);
 
-		return core->acquireRoutine(fullName, optLevel);
+		return core->acquireRoutine(fullName, cfg);
 	}
 
 	template<class T, class S>
diff --git a/src/Reactor/Reactor.vcxproj b/src/Reactor/Reactor.vcxproj
index 5326d2c..a885861 100644
--- a/src/Reactor/Reactor.vcxproj
+++ b/src/Reactor/Reactor.vcxproj
@@ -290,7 +290,6 @@
     <ClCompile Include="LLVMReactorDebugInfo.cpp" />

     <ClCompile Include="ExecutableMemory.cpp" />

     <ClCompile Include="Reactor.cpp" />

-    <ClCompile Include="Routine.cpp" />

     <ClCompile Include="Thread.cpp" />

   </ItemGroup>

   <ItemGroup>

diff --git a/src/Reactor/Reactor.vcxproj.filters b/src/Reactor/Reactor.vcxproj.filters
index f66a728..c7efa0b 100644
--- a/src/Reactor/Reactor.vcxproj.filters
+++ b/src/Reactor/Reactor.vcxproj.filters
@@ -15,9 +15,6 @@
     </Filter>

   </ItemGroup>

   <ItemGroup>

-    <ClCompile Include="Routine.cpp">

-      <Filter>Source Files</Filter>

-    </ClCompile>

     <ClCompile Include="LLVMReactor.cpp">

       <Filter>Source Files</Filter>

     </ClCompile>

diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index 490433e..9bc1227c 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -38,7 +38,7 @@
 
 TEST(ReactorUnitTests, Sample)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Int>, Int)> function;
@@ -73,12 +73,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Uninitialized)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int()> function;
@@ -110,12 +109,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Unreachable)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Int)> function;
@@ -141,12 +139,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, VariableAddress)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Int)> function;
@@ -169,12 +166,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, SubVectorLoadStore)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>, Pointer<Byte>)> function;
@@ -229,12 +225,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, VectorConstant)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -278,12 +273,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Concatenate)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -321,12 +315,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Swizzle)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -448,12 +441,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Branching)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Void)> function;
@@ -513,12 +505,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, MinMax)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -604,12 +595,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, NotNeg)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -689,12 +679,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, VectorCompare)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -751,12 +740,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, SaturatedAddAndSubtract)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -864,12 +852,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Unpack)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>,Pointer<Byte>)> function;
@@ -911,12 +898,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Pack)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -973,12 +959,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, MulHigh)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -1050,12 +1035,11 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, MulAdd)
 {
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function<Int(Pointer<Byte>)> function;
@@ -1086,7 +1070,6 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Call)
@@ -1097,7 +1080,7 @@
 		return;
 	}
 
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	struct Class
 	{
@@ -1137,7 +1120,6 @@
 		}
 	}
 
-	delete routine;
 }
 
 // Check that a complex generated function which utilizes all 8 or 16 XMM
@@ -1148,7 +1130,7 @@
 // It's necessary to inspect the registers in a debugger to actually verify.)
 TEST(ReactorUnitTests, PreserveXMMRegisters)
 {
-    Routine *routine = nullptr;
+    std::shared_ptr<Routine> routine;
 
     {
         Function<Void(Pointer<Byte>, Pointer<Byte>)> function;
@@ -1225,7 +1207,6 @@
         EXPECT_EQ(result[3], 0.0f);
     }
 
-    delete routine;
 }
 
 template <typename T>
@@ -1255,7 +1236,7 @@
 	using CType = typename TestFixture::CType;
 	using ReactorType = typename TestFixture::ReactorType;
 
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function< Int(ReactorType) > function;
@@ -1277,7 +1258,6 @@
 		}
 	}
 
-	delete routine;
 }
 
 template <typename T>
@@ -1327,7 +1307,7 @@
 	using CType = typename TestFixture::CType;
 	using ReactorType = typename TestFixture::ReactorType;
 
-	Routine *routine = nullptr;
+	std::shared_ptr<Routine> routine;
 
 	{
 		Function< Pointer<ReactorType>(Pointer<ReactorType>, Int) > function;
@@ -1367,7 +1347,6 @@
 		}
 	}
 
-	delete routine;
 }
 
 TEST(ReactorUnitTests, Coroutines_Fibonacci)
diff --git a/src/Reactor/ReactorUnitTests.vcxproj b/src/Reactor/ReactorUnitTests.vcxproj
index d5bd456..606cc1a 100644
--- a/src/Reactor/ReactorUnitTests.vcxproj
+++ b/src/Reactor/ReactorUnitTests.vcxproj
@@ -126,6 +126,7 @@
       <TreatSpecificWarningsAsErrors>5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

@@ -170,6 +171,7 @@
       <TreatSpecificWarningsAsErrors>5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

diff --git a/src/Reactor/Routine.cpp b/src/Reactor/Routine.cpp
deleted file mode 100644
index 23cf929..0000000
--- a/src/Reactor/Routine.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "Routine.hpp"
-
-#include "Thread.hpp"
-
-#include <cassert>
-
-namespace rr
-{
-	Routine::Routine()
-	{
-		bindCount = 0;
-	}
-
-	void Routine::bind()
-	{
-		atomicIncrement(&bindCount);
-	}
-
-	void Routine::unbind()
-	{
-		long count = atomicDecrement(&bindCount);
-
-		if(count == 0)
-		{
-			delete this;
-		}
-	}
-
-	Routine::~Routine()
-	{
-		assert(bindCount == 0);
-	}
-}
diff --git a/src/Reactor/Routine.hpp b/src/Reactor/Routine.hpp
index 0158bcc..67560e8 100644
--- a/src/Reactor/Routine.hpp
+++ b/src/Reactor/Routine.hpp
@@ -20,18 +20,10 @@
 	class Routine
 	{
 	public:
-		Routine();
-
-		virtual ~Routine();
+		Routine() = default;
+		virtual ~Routine() = default;  // Virtual so a derived routine can be destroyed through a Routine pointer.
 
 		virtual const void *getEntry(int index = 0) = 0;
-
-		// Reference counting
-		void bind();
-		void unbind();
-
-	private:
-		volatile int bindCount;
 	};
 }
 
diff --git a/src/Reactor/Subzero.vcxproj b/src/Reactor/Subzero.vcxproj
index 8866f16..6b6490d 100644
--- a/src/Reactor/Subzero.vcxproj
+++ b/src/Reactor/Subzero.vcxproj
@@ -102,6 +102,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

@@ -122,6 +123,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

@@ -144,6 +146,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

@@ -172,6 +175,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

@@ -226,7 +230,6 @@
     <ClCompile Include="ExecutableMemory.cpp" />

     <ClCompile Include="Optimizer.cpp" />

     <ClCompile Include="Reactor.cpp" />

-    <ClCompile Include="Routine.cpp" />

     <ClCompile Include="SubzeroReactor.cpp" />

   </ItemGroup>

   <ItemGroup>

diff --git a/src/Reactor/Subzero.vcxproj.filters b/src/Reactor/Subzero.vcxproj.filters
index 7229339..5b9bb2d 100644
--- a/src/Reactor/Subzero.vcxproj.filters
+++ b/src/Reactor/Subzero.vcxproj.filters
@@ -102,9 +102,6 @@
     <ClCompile Include="SubzeroReactor.cpp">

       <Filter>Source Files</Filter>

     </ClCompile>

-    <ClCompile Include="Routine.cpp">

-      <Filter>Source Files</Filter>

-    </ClCompile>

     <ClCompile Include="$(SolutionDir)third_party\subzero\src\IceInstX8632.cpp">

       <Filter>Source Files</Filter>

     </ClCompile>

diff --git a/src/Reactor/SubzeroLLVMDependencies.vcxproj b/src/Reactor/SubzeroLLVMDependencies.vcxproj
index dbae0f8..38d7598 100644
--- a/src/Reactor/SubzeroLLVMDependencies.vcxproj
+++ b/src/Reactor/SubzeroLLVMDependencies.vcxproj
@@ -96,6 +96,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

   </ItemDefinitionGroup>

   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">

@@ -109,6 +110,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

   </ItemDefinitionGroup>

   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">

@@ -124,6 +126,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <EnableCOMDATFolding>true</EnableCOMDATFolding>

@@ -143,6 +146,7 @@
       <MultiProcessorCompilation>true</MultiProcessorCompilation>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <EnableCOMDATFolding>true</EnableCOMDATFolding>

diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index bcc2f7e..0cf2370 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -55,6 +55,18 @@
 
 namespace
 {
+	// Default configuration settings. Must be accessed under mutex lock.
+	std::mutex defaultConfigLock;
+	rr::Config &defaultConfig()
+	{
+		// This uses a static in a function to avoid the cost of a global static
+		// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
+		static rr::Config config = rr::Config::Edit()
+			.set(rr::Optimization::Level::Default)
+			.apply({});  // The edit is applied to a default-constructed Config.
+		return config;  // Mutable reference; this function takes no lock itself, callers hold defaultConfigLock.
+	}
+
 	Ice::GlobalContext *context = nullptr;
 	Ice::Cfg *function = nullptr;
 	Ice::CfgNode *basicBlock = nullptr;
@@ -77,6 +89,19 @@
 		#define __x86_64__ 1
 	#endif
 
+	static Ice::OptLevel toIce(rr::Optimization::Level level)  // Maps a Reactor optimization level to the corresponding Ice::OptLevel.
+	{
+		switch (level)
+		{
+			case rr::Optimization::Level::None:       return Ice::Opt_0;
+			case rr::Optimization::Level::Less:       return Ice::Opt_1;
+			case rr::Optimization::Level::Default:    return Ice::Opt_2;
+			case rr::Optimization::Level::Aggressive: return Ice::Opt_2;  // Maps to the same Ice level as Default.
+			default: UNREACHABLE("Unknown Optimization Level %d", int(level));
+		}
+		return Ice::Opt_2;  // Reached only if the default case falls through the switch.
+	}
+
 	class CPUID
 	{
 	public:
@@ -204,8 +229,6 @@
 		return Ice::typeWidthInBytes(T(type));
 	}
 
-	Optimization optimization[10] = {InstructionCombining, Disabled};
-
 	using ElfHeader = std::conditional<sizeof(void*) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
 	using SectionHeader = std::conditional<sizeof(void*) == 8, Elf64_Shdr, Elf32_Shdr>::type;
 
@@ -548,7 +571,7 @@
 			Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
 		#endif
 		Flags.setOutFileType(Ice::FT_Elf);
-		Flags.setOptLevel(Ice::Opt_2);
+		Flags.setOptLevel(toIce(getDefaultConfig().getOptimization().getLevel()));
 		Flags.setApplicationBinaryInterface(Ice::ABI_Platform);
 		Flags.setVerbose(false ? Ice::IceV_Most : Ice::IceV_None);
 		Flags.setDisableHybridAssembly(true);
@@ -585,7 +608,26 @@
 		::codegenMutex.unlock();
 	}
 
-	Routine *Nucleus::acquireRoutine(const char *name, OptimizationLevel optimizationLevel)
+	void Nucleus::setDefaultConfig(const Config &cfg)  // Replaces the process-wide default configuration with cfg.
+	{
+		std::unique_lock<std::mutex> lock(::defaultConfigLock);  // Held for the whole assignment.
+		::defaultConfig() = cfg;
+	}
+
+	void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)  // Applies cfgEdit to the current default configuration in place.
+	{
+		std::unique_lock<std::mutex> lock(::defaultConfigLock);  // Read, edit and write-back all happen under one lock.
+		auto &config = ::defaultConfig();
+		config = cfgEdit.apply(config);
+	}
+
+	Config Nucleus::getDefaultConfig()  // Returns the current default configuration by value.
+	{
+		std::unique_lock<std::mutex> lock(::defaultConfigLock);
+		return ::defaultConfig();  // The returned copy is made while the lock is still held.
+	}
+
+	std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
 	{
 		if(basicBlock->getInsts().empty() || basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
 		{
@@ -594,7 +636,7 @@
 
 		::function->setFunctionName(Ice::GlobalString::createWithString(::context, name));
 
-		optimize();
+		rr::optimize(::function);
 
 		::function->translate();
 		ASSERT(!::function->hasError());
@@ -621,12 +663,7 @@
 		Routine *handoffRoutine = ::routine;
 		::routine = nullptr;
 
-		return handoffRoutine;
-	}
-
-	void Nucleus::optimize()
-	{
-		rr::optimize(::function);
+		return std::shared_ptr<Routine>(handoffRoutine);
 	}
 
 	Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
@@ -3506,7 +3543,7 @@
 	void FlushDebug() {}
 
 	void Nucleus::createCoroutine(Type *YieldType, std::vector<Type*> &Params) { UNIMPLEMENTED("createCoroutine"); }
-	Routine* Nucleus::acquireCoroutine(const char *name, OptimizationLevel optimizationLevel) { UNIMPLEMENTED("acquireCoroutine"); return nullptr; }
+	std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */) { UNIMPLEMENTED("acquireCoroutine"); return nullptr; }
 	void Nucleus::yield(Value* val) { UNIMPLEMENTED("Yield"); }
 
 }
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index 30ef1e9..d4edbfa 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -1179,7 +1179,7 @@
 		return s;
 	}
 
-	Routine *Blitter::generate(const State &state)
+	std::shared_ptr<Routine> Blitter::generate(const State &state)
 	{
 		Function<Void(Pointer<Byte>)> function;
 		{
@@ -1420,7 +1420,7 @@
 		state.destSamples = dest->getSamples();
 
 		criticalSection.lock();
-		Routine *blitRoutine = blitCache->query(state);
+		auto blitRoutine = blitCache->query(state);
 
 		if(!blitRoutine)
 		{
diff --git a/src/Renderer/Blitter.hpp b/src/Renderer/Blitter.hpp
index e3db745..9c6b4c0 100644
--- a/src/Renderer/Blitter.hpp
+++ b/src/Renderer/Blitter.hpp
@@ -111,7 +111,7 @@
 		static Float4 LinearToSRGB(Float4 &color);
 		static Float4 sRGBtoLinear(Float4 &color);
 		bool blitReactor(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, const Options &options);
-		Routine *generate(const State &state);
+		std::shared_ptr<Routine> generate(const State &state);
 
 		RoutineCache<State> *blitCache;
 		MutexLock criticalSection;
diff --git a/src/Renderer/LRUCache.hpp b/src/Renderer/LRUCache.hpp
index 1a1a302..bdd0950 100644
--- a/src/Renderer/LRUCache.hpp
+++ b/src/Renderer/LRUCache.hpp
@@ -27,9 +27,9 @@
 
 		~LRUCache();
 
-		Data *query(const Key &key) const;
-		Data *add(const Key &key, Data *data);
-	
+		Data query(const Key &key) const;
+		Data add(const Key &key, const Data &data);
+
 		int getSize() {return size;}
 		Key &getKey(int i) {return key[i];}
 
@@ -41,7 +41,7 @@
 
 		Key *key;
 		Key **ref;
-		Data **data;
+		Data *data;
 	};
 }
 
@@ -57,12 +57,10 @@
 
 		key = new Key[size];
 		ref = new Key*[size];
-		data = new Data*[size];
+		data = new Data[size];
 
 		for(int i = 0; i < size; i++)
 		{
-			data[i] = nullptr;
-
 			ref[i] = &key[i];
 		}
 	}
@@ -76,21 +74,12 @@
 		delete[] ref;
 		ref = nullptr;
 
-		for(int i = 0; i < size; i++)
-		{
-			if(data[i])
-			{
-				data[i]->unbind();
-				data[i] = nullptr;
-			}
-		}
-
 		delete[] data;
 		data = nullptr;
 	}
 
 	template<class Key, class Data>
-	Data *LRUCache<Key, Data>::query(const Key &key) const
+	Data LRUCache<Key, Data>::query(const Key &key) const
 	{
 		for(int i = top; i > top - fill; i--)
 		{
@@ -98,14 +87,14 @@
 
 			if(key == *ref[j])
 			{
-				Data *hit = data[j];
+				Data hit = data[j];
 
 				if(i != top)
 				{
 					// Move one up
 					int k = (j + 1) & mask;
 
-					Data *swapD = data[k];
+					Data swapD = data[k];
 					data[k] = data[j];
 					data[j] = swapD;
 
@@ -122,20 +111,12 @@
 	}
 
 	template<class Key, class Data>
-	Data *LRUCache<Key, Data>::add(const Key &key, Data *data)
+	Data LRUCache<Key, Data>::add(const Key &key, const Data &data)
 	{
 		top = (top + 1) & mask;
 		fill = fill + 1 < size ? fill + 1 : size;
 
 		*ref[top] = key;
-
-		data->bind();
-
-		if(this->data[top])
-		{
-			this->data[top]->unbind();
-		}
-
 		this->data[top] = data;
 
 		return data;
diff --git a/src/Renderer/PixelProcessor.cpp b/src/Renderer/PixelProcessor.cpp
index 133f90c..0b80727 100644
--- a/src/Renderer/PixelProcessor.cpp
+++ b/src/Renderer/PixelProcessor.cpp
@@ -1182,9 +1182,9 @@
 		return state;
 	}
 
-	Routine *PixelProcessor::routine(const State &state)
+	std::shared_ptr<Routine> PixelProcessor::routine(const State &state)
 	{
-		Routine *routine = routineCache->query(state);
+		auto routine = routineCache->query(state);
 
 		if(!routine)
 		{
diff --git a/src/Renderer/PixelProcessor.hpp b/src/Renderer/PixelProcessor.hpp
index 98300de..4fa627c 100644
--- a/src/Renderer/PixelProcessor.hpp
+++ b/src/Renderer/PixelProcessor.hpp
@@ -306,7 +306,7 @@
 
 	protected:
 		const State update() const;
-		Routine *routine(const State &state);
+		std::shared_ptr<Routine> routine(const State &state);
 		void setRoutineCacheSize(int routineCacheSize);
 
 		// Shader constants
diff --git a/src/Renderer/Renderer.cpp b/src/Renderer/Renderer.cpp
index 87b8dd1..c3c2260 100644
--- a/src/Renderer/Renderer.cpp
+++ b/src/Renderer/Renderer.cpp
@@ -360,10 +360,6 @@
 			draw->drawType = drawType;
 			draw->batchSize = batch;
 
-			vertexRoutine->bind();
-			setupRoutine->bind();
-			pixelRoutine->bind();
-
 			draw->vertexRoutine = vertexRoutine;
 			draw->setupRoutine = setupRoutine;
 			draw->pixelRoutine = pixelRoutine;
@@ -1105,9 +1101,9 @@
 					}
 				}
 
-				draw.vertexRoutine->unbind();
-				draw.setupRoutine->unbind();
-				draw.pixelRoutine->unbind();
+				draw.vertexRoutine.reset();
+				draw.setupRoutine.reset();
+				draw.pixelRoutine.reset();
 
 				sync->unlock();
 
@@ -2010,12 +2006,6 @@
 		P[3].y -= Y;
 		C[3] = clipper->computeClipFlags(P[3]);
 
-		triangle.v1 = triangle.v0;
-		triangle.v2 = triangle.v0;
-
-		triangle.v1.X += iround(16 * 0.5f * pSize);
-		triangle.v2.Y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
-
 		Polygon polygon(P, 4);
 
 		if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
@@ -2030,6 +2020,11 @@
 				}
 			}
 
+			triangle.v1 = triangle.v0;
+			triangle.v2 = triangle.v0;
+
+			triangle.v1.X += iround(16 * 0.5f * pSize);
+			triangle.v2.Y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
 			return setupRoutine(&primitive, &triangle, &polygon, &data);
 		}
 
@@ -2855,10 +2850,13 @@
 			CPUID::setEnableSSE2(configuration.enableSSE2);
 			CPUID::setEnableSSE(configuration.enableSSE);
 
-			for(int pass = 0; pass < 10; pass++)
+			rr::Config::Edit cfg;
+			cfg.clearOptimizationPasses();
+			for(auto pass : configuration.optimization)
 			{
-				optimization[pass] = configuration.optimization[pass];
+				if (pass != rr::Optimization::Pass::Disabled) { cfg.add(pass); }
 			}
+			rr::Nucleus::adjustDefaultConfig(cfg);
 
 			forceWindowed = configuration.forceWindowed;
 			complementaryDepthBuffer = configuration.complementaryDepthBuffer;
diff --git a/src/Renderer/Renderer.hpp b/src/Renderer/Renderer.hpp
index 1118c59..4ed11f6 100644
--- a/src/Renderer/Renderer.hpp
+++ b/src/Renderer/Renderer.hpp
@@ -458,9 +458,9 @@
 		SetupProcessor::State setupState;
 		PixelProcessor::State pixelState;
 
-		Routine *vertexRoutine;
-		Routine *setupRoutine;
-		Routine *pixelRoutine;
+		std::shared_ptr<Routine> vertexRoutine;
+		std::shared_ptr<Routine> setupRoutine;
+		std::shared_ptr<Routine> pixelRoutine;
 	};
 
 	struct DrawCall
@@ -472,9 +472,9 @@
 		AtomicInt drawType;
 		AtomicInt batchSize;
 
-		Routine *vertexRoutine;
-		Routine *setupRoutine;
-		Routine *pixelRoutine;
+		std::shared_ptr<Routine> vertexRoutine;
+		std::shared_ptr<Routine> setupRoutine;
+		std::shared_ptr<Routine> pixelRoutine;
 
 		VertexProcessor::RoutinePointer vertexPointer;
 		SetupProcessor::RoutinePointer setupPointer;
diff --git a/src/Renderer/RoutineCache.hpp b/src/Renderer/RoutineCache.hpp
index 8420468..61f635a 100644
--- a/src/Renderer/RoutineCache.hpp
+++ b/src/Renderer/RoutineCache.hpp
@@ -24,7 +24,7 @@
 	using namespace rr;
 
 	template<class State>
-	using RoutineCache = LRUCache<State, Routine>;
+	using RoutineCache = LRUCache<State, std::shared_ptr<Routine>>;
 }
 
 #endif   // sw_RoutineCache_hpp
diff --git a/src/Renderer/SetupProcessor.cpp b/src/Renderer/SetupProcessor.cpp
index 7211406..d8b9b91 100644
--- a/src/Renderer/SetupProcessor.cpp
+++ b/src/Renderer/SetupProcessor.cpp
@@ -223,9 +223,9 @@
 		return state;
 	}
 
-	Routine *SetupProcessor::routine(const State &state)
+	std::shared_ptr<Routine> SetupProcessor::routine(const State &state)
 	{
-		Routine *routine = routineCache->query(state);
+		auto routine = routineCache->query(state);
 
 		if(!routine)
 		{
diff --git a/src/Renderer/SetupProcessor.hpp b/src/Renderer/SetupProcessor.hpp
index be0adc7..de12afd 100644
--- a/src/Renderer/SetupProcessor.hpp
+++ b/src/Renderer/SetupProcessor.hpp
@@ -91,7 +91,7 @@
 
 	protected:
 		State update() const;
-		Routine *routine(const State &state);
+		std::shared_ptr<Routine> routine(const State &state);
 
 		void setRoutineCacheSize(int cacheSize);
 
diff --git a/src/Renderer/VertexProcessor.cpp b/src/Renderer/VertexProcessor.cpp
index 463393a..9bd786e 100644
--- a/src/Renderer/VertexProcessor.cpp
+++ b/src/Renderer/VertexProcessor.cpp
@@ -1088,9 +1088,9 @@
 		return state;
 	}
 
-	Routine *VertexProcessor::routine(const State &state)
+	std::shared_ptr<Routine> VertexProcessor::routine(const State &state)
 	{
-		Routine *routine = routineCache->query(state);
+		auto routine = routineCache->query(state);
 
 		if(!routine)   // Create one
 		{
diff --git a/src/Renderer/VertexProcessor.hpp b/src/Renderer/VertexProcessor.hpp
index b53263b..329bdac 100644
--- a/src/Renderer/VertexProcessor.hpp
+++ b/src/Renderer/VertexProcessor.hpp
@@ -284,7 +284,7 @@
 		const Matrix &getViewTransform();
 
 		const State update(DrawType drawType);
-		Routine *routine(const State &state);
+		std::shared_ptr<Routine> routine(const State &state);
 
 		bool isFixedFunction();
 		void setRoutineCacheSize(int cacheSize);
diff --git a/src/Shader/SetupRoutine.cpp b/src/Shader/SetupRoutine.cpp
index 6024869..4f2955c 100644
--- a/src/Shader/SetupRoutine.cpp
+++ b/src/Shader/SetupRoutine.cpp
@@ -665,7 +665,7 @@
 		#endif
 	}
 
-	Routine *SetupRoutine::getRoutine()
+	std::shared_ptr<Routine> SetupRoutine::getRoutine()
 	{
 		return routine;
 	}
diff --git a/src/Shader/SetupRoutine.hpp b/src/Shader/SetupRoutine.hpp
index c1c3205..0f34249 100644
--- a/src/Shader/SetupRoutine.hpp
+++ b/src/Shader/SetupRoutine.hpp
@@ -30,7 +30,7 @@
 		virtual ~SetupRoutine();
 
 		void generate();
-		Routine *getRoutine();
+		std::shared_ptr<Routine> getRoutine();
 
 	private:
 		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool sprite, bool perspective, bool wrap, int component);
@@ -40,7 +40,7 @@
 
 		const SetupProcessor::State &state;
 
-		Routine *routine;
+		std::shared_ptr<Routine> routine;
 	};
 }
 
diff --git a/src/SwiftShader/SwiftShader.vcxproj b/src/SwiftShader/SwiftShader.vcxproj
index c772c28..78b8925 100644
--- a/src/SwiftShader/SwiftShader.vcxproj
+++ b/src/SwiftShader/SwiftShader.vcxproj
@@ -131,7 +131,7 @@
       <BrowseInformation>true</BrowseInformation>

       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <RuntimeTypeInfo>false</RuntimeTypeInfo>

       <TreatWarningAsError>true</TreatWarningAsError>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

@@ -167,7 +167,7 @@
       <BrowseInformation>true</BrowseInformation>

       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <RuntimeTypeInfo>false</RuntimeTypeInfo>

       <TreatWarningAsError>true</TreatWarningAsError>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

@@ -200,7 +200,7 @@
       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>

       </DebugInformationFormat>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <ForcedIncludeFiles>%(ForcedIncludeFiles)</ForcedIncludeFiles>

       <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>

       <StringPooling>true</StringPooling>

@@ -236,7 +236,7 @@
       </PrecompiledHeader>

       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <ForcedIncludeFiles>%(ForcedIncludeFiles)</ForcedIncludeFiles>

       <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>

       <StringPooling>true</StringPooling>

@@ -276,7 +276,7 @@
       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>

       </DebugInformationFormat>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <ForcedIncludeFiles>%(ForcedIncludeFiles)</ForcedIncludeFiles>

       <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>

       <StringPooling>true</StringPooling>

@@ -315,7 +315,7 @@
       </PrecompiledHeader>

       <WarningLevel>Level3</WarningLevel>

       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>

-      <DisableSpecificWarnings>5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

+      <DisableSpecificWarnings>4267;5030;%(DisableSpecificWarnings)</DisableSpecificWarnings>

       <ForcedIncludeFiles>%(ForcedIncludeFiles)</ForcedIncludeFiles>

       <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>

       <StringPooling>true</StringPooling>

diff --git a/src/System/Memory.cpp b/src/System/Memory.cpp
index 663732f..e045254 100644
--- a/src/System/Memory.cpp
+++ b/src/System/Memory.cpp
@@ -31,6 +31,7 @@
 #endif
 
 #include <cstring>
+#include <cstdlib>
 
 #undef allocate
 #undef deallocate
@@ -70,7 +71,7 @@
 			return allocation;
 		}
 	#else
-		unsigned char *block = new unsigned char[bytes + sizeof(Allocation) + alignment];
+		unsigned char *block = (unsigned char*)malloc(bytes + sizeof(Allocation) + alignment);
 		unsigned char *aligned = nullptr;
 
 		if(block)
@@ -127,7 +128,7 @@
 			unsigned char *aligned = (unsigned char*)memory;
 			Allocation *allocation = (Allocation*)(aligned - sizeof(Allocation));
 
-			delete[] allocation->block;
+			free(allocation->block);
 		}
 	#endif
 }
diff --git a/src/Vulkan/BUILD.gn b/src/Vulkan/BUILD.gn
index 6aeaec8..3fd343f 100644
--- a/src/Vulkan/BUILD.gn
+++ b/src/Vulkan/BUILD.gn
@@ -108,7 +108,13 @@
   output_name = "libvulkan"
   output_dir = "$root_out_dir/swiftshader"
 
-  if (is_linux) {
+  if (is_mac) {
+    ldflags = [
+      "-Wl,-install_name,@rpath/libvk_swiftshader.dylib",
+      "-Wl,-exported_symbols_list," +
+          rebase_path("libvk_swiftshader.exports", root_build_dir),
+    ]
+  } else if (is_linux) {
     inputs = [
       "libvk_swiftshader.lds",
     ]
diff --git a/src/Vulkan/VkCommandBuffer.cpp b/src/Vulkan/VkCommandBuffer.cpp
index 7c4edac..d787886 100644
--- a/src/Vulkan/VkCommandBuffer.cpp
+++ b/src/Vulkan/VkCommandBuffer.cpp
@@ -1491,7 +1491,7 @@
 
 	for(uint32_t i = 0; i < rangeCount; i++)
 	{
-		addCommand<ClearColorImage>(image, pColor[i], pRanges[i]);
+		addCommand<ClearColorImage>(image, *pColor, pRanges[i]);
 	}
 }
 
@@ -1502,7 +1502,7 @@
 
 	for(uint32_t i = 0; i < rangeCount; i++)
 	{
-		addCommand<ClearDepthStencilImage>(image, pDepthStencil[i], pRanges[i]);
+		addCommand<ClearDepthStencilImage>(image, *pDepthStencil, pRanges[i]);
 	}
 }
 
diff --git a/src/Vulkan/VkConfig.h b/src/Vulkan/VkConfig.h
index adaa353..157f34e 100644
--- a/src/Vulkan/VkConfig.h
+++ b/src/Vulkan/VkConfig.h
@@ -17,8 +17,6 @@
 
 #include "Version.h"
 
-#include "Reactor/Nucleus.hpp" // ReactorOptimizationLevel
-
 #include <Vulkan/VulkanPlatform.h>
 
 namespace vk
@@ -79,9 +77,6 @@
 	MAX_POINT_SIZE = 1,		// Large points are not supported. If/when we turn this on, must be >= 64.
 };
 
-// Optimization level to use for JIT functions.
-static constexpr auto ReactorOptimizationLevel = rr::OptimizationLevel::Default;
-
 }
 
 #endif // VK_CONFIG_HPP_
diff --git a/src/Vulkan/VkDescriptorPool.cpp b/src/Vulkan/VkDescriptorPool.cpp
index 18a9d16..79b46cc 100644
--- a/src/Vulkan/VkDescriptorPool.cpp
+++ b/src/Vulkan/VkDescriptorPool.cpp
@@ -51,12 +51,12 @@
 
 size_t DescriptorPool::ComputeRequiredAllocationSize(const VkDescriptorPoolCreateInfo* pCreateInfo)
 {
-	size_t size = pCreateInfo->maxSets * sizeof(DescriptorSetHeader);
+	size_t size = pCreateInfo->maxSets * sw::align(sizeof(DescriptorSetHeader), 16);
 
 	for(uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++)
 	{
 		size += pCreateInfo->pPoolSizes[i].descriptorCount *
-		        DescriptorSetLayout::GetDescriptorSize(pCreateInfo->pPoolSizes[i].type);
+		        sw::align(DescriptorSetLayout::GetDescriptorSize(pCreateInfo->pPoolSizes[i].type), 16);
 	}
 
 	return size;
@@ -101,7 +101,7 @@
 	}
 
 	// Second, look for space at the beginning of the pool
-	const auto itBegin = nodes.end();
+	const auto itBegin = nodes.begin();
 	freeSpace = itBegin->set - pool;
 	if(freeSpace >= size)
 	{
@@ -215,7 +215,7 @@
 	totalFreeSize += poolSize - (itLast->set - pool) + itLast->size;
 
 	// Compute space at the beginning of the pool
-	const auto itBegin = nodes.end();
+	const auto itBegin = nodes.begin();
 	totalFreeSize += itBegin->set - pool;
 
 	// Finally, look between existing pool items
diff --git a/src/Vulkan/VkDescriptorSet.hpp b/src/Vulkan/VkDescriptorSet.hpp
index a733a5b..fc50148 100644
--- a/src/Vulkan/VkDescriptorSet.hpp
+++ b/src/Vulkan/VkDescriptorSet.hpp
@@ -29,7 +29,7 @@
 		DescriptorSetLayout* layout;
 	};
 
-	class DescriptorSet
+	class alignas(16) DescriptorSet
 	{
 	public:
 		static inline DescriptorSet* Cast(VkDescriptorSet object)
diff --git a/src/Vulkan/VkDescriptorSetLayout.cpp b/src/Vulkan/VkDescriptorSetLayout.cpp
index 65e625e..e4d87ce 100644
--- a/src/Vulkan/VkDescriptorSetLayout.cpp
+++ b/src/Vulkan/VkDescriptorSetLayout.cpp
@@ -269,7 +269,7 @@
 	}
 }
 
-void DescriptorSetLayout::WriteDescriptorSet(DescriptorSet *dstSet, VkDescriptorUpdateTemplateEntry const &entry, char const *src)
+void DescriptorSetLayout::WriteDescriptorSet(Device* device, DescriptorSet *dstSet, VkDescriptorUpdateTemplateEntry const &entry, char const *src)
 {
 	DescriptorSetLayout* dstLayout = dstSet->header.layout;
 	auto &binding = dstLayout->bindings[dstLayout->getBindingIndex(entry.dstBinding)];
@@ -294,6 +294,7 @@
 			{
 				imageSampler[i].updateSampler(vk::Cast(update->sampler));
 			}
+			imageSampler[i].device = device;
 		}
 	}
 	else if (entry.descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER)
@@ -319,6 +320,7 @@
 			imageSampler[i].texture.width = sw::replicate(static_cast<float>(numElements));
 			imageSampler[i].texture.height = sw::replicate(1);
 			imageSampler[i].texture.depth = sw::replicate(1);
+			imageSampler[i].device = device;
 
 			sw::Mipmap &mipmap = imageSampler[i].texture.mipmap[0];
 			mipmap.buffer = bufferView->getPointer();
@@ -360,6 +362,7 @@
 			imageSampler[i].type = imageView->getType();
 			imageSampler[i].swizzle = imageView->getComponentMapping();
 			imageSampler[i].format = format;
+			imageSampler[i].device = device;
 
 			auto &subresourceRange = imageView->getSubresourceRange();
 
@@ -572,7 +575,7 @@
 	mipmap.sliceP[3] = sliceP;
 }
 
-void DescriptorSetLayout::WriteDescriptorSet(const VkWriteDescriptorSet& writeDescriptorSet)
+void DescriptorSetLayout::WriteDescriptorSet(Device* device, const VkWriteDescriptorSet& writeDescriptorSet)
 {
 	DescriptorSet* dstSet = vk::Cast(writeDescriptorSet.dstSet);
 	VkDescriptorUpdateTemplateEntry e;
@@ -611,7 +614,7 @@
 		UNIMPLEMENTED("descriptor type %u", writeDescriptorSet.descriptorType);
 	}
 
-	WriteDescriptorSet(dstSet, e, reinterpret_cast<char const *>(ptr));
+	WriteDescriptorSet(device, dstSet, e, reinterpret_cast<char const *>(ptr));
 }
 
 void DescriptorSetLayout::CopyDescriptorSet(const VkCopyDescriptorSet& descriptorCopies)
diff --git a/src/Vulkan/VkDescriptorSetLayout.hpp b/src/Vulkan/VkDescriptorSetLayout.hpp
index 44ac8f2..73535e8 100644
--- a/src/Vulkan/VkDescriptorSetLayout.hpp
+++ b/src/Vulkan/VkDescriptorSetLayout.hpp
@@ -25,6 +25,7 @@
 {
 
 class DescriptorSet;
+class Device;
 
 // TODO(b/129523279): Move to the Device or Pipeline layer.
 struct alignas(16) SampledImageDescriptor
@@ -35,6 +36,7 @@
 
 	// TODO(b/129523279): Minimize to the data actually needed.
 	vk::Sampler sampler;
+	vk::Device* device;
 
 	uint32_t imageViewId;
 	VkImageViewType type;
@@ -84,10 +86,10 @@
 	static size_t ComputeRequiredAllocationSize(const VkDescriptorSetLayoutCreateInfo* pCreateInfo);
 
 	static size_t GetDescriptorSize(VkDescriptorType type);
-	static void WriteDescriptorSet(const VkWriteDescriptorSet& descriptorWrites);
+	static void WriteDescriptorSet(Device* device, const VkWriteDescriptorSet& descriptorWrites);
 	static void CopyDescriptorSet(const VkCopyDescriptorSet& descriptorCopies);
 
-	static void WriteDescriptorSet(DescriptorSet *dstSet, VkDescriptorUpdateTemplateEntry const &entry, char const *src);
+	static void WriteDescriptorSet(Device* device, DescriptorSet *dstSet, VkDescriptorUpdateTemplateEntry const &entry, char const *src);
 	static void WriteTextureLevelInfo(sw::Texture *texture, int level, int width, int height, int depth, int pitchP, int sliceP);
 
 	void initialize(DescriptorSet* descriptorSet);
diff --git a/src/Vulkan/VkDescriptorUpdateTemplate.cpp b/src/Vulkan/VkDescriptorUpdateTemplate.cpp
index 76acbe7..e70ad73 100644
--- a/src/Vulkan/VkDescriptorUpdateTemplate.cpp
+++ b/src/Vulkan/VkDescriptorUpdateTemplate.cpp
@@ -35,14 +35,14 @@
 		return info->descriptorUpdateEntryCount * sizeof(VkDescriptorUpdateTemplateEntry);
 	}
 
-	void DescriptorUpdateTemplate::updateDescriptorSet(VkDescriptorSet vkDescriptorSet, const void* pData)
+	void DescriptorUpdateTemplate::updateDescriptorSet(Device* device, VkDescriptorSet vkDescriptorSet, const void* pData)
 	{
 
 		DescriptorSet* descriptorSet = vk::Cast(vkDescriptorSet);
 
 		for(uint32_t i = 0; i < descriptorUpdateEntryCount; i++)
 		{
-			DescriptorSetLayout::WriteDescriptorSet(descriptorSet, descriptorUpdateEntries[i],
+			DescriptorSetLayout::WriteDescriptorSet(device, descriptorSet, descriptorUpdateEntries[i],
 													reinterpret_cast<char const *>(pData));
 		}
 	}
diff --git a/src/Vulkan/VkDescriptorUpdateTemplate.hpp b/src/Vulkan/VkDescriptorUpdateTemplate.hpp
index 7f0e5be..90a8b96 100644
--- a/src/Vulkan/VkDescriptorUpdateTemplate.hpp
+++ b/src/Vulkan/VkDescriptorUpdateTemplate.hpp
@@ -20,6 +20,7 @@
 namespace vk

 {

 	class DescriptorSetLayout;

+	class Device;

 

 	class DescriptorUpdateTemplate : public Object<DescriptorUpdateTemplate, VkDescriptorUpdateTemplate>

 	{

@@ -28,7 +29,7 @@
 

 		static size_t ComputeRequiredAllocationSize(const VkDescriptorUpdateTemplateCreateInfo* info);

 

-		void updateDescriptorSet(VkDescriptorSet descriptorSet, const void* pData);

+		void updateDescriptorSet(Device* device, VkDescriptorSet descriptorSet, const void* pData);

 

 	private:

 		uint32_t                              descriptorUpdateEntryCount = 0;

diff --git a/src/Vulkan/VkDevice.cpp b/src/Vulkan/VkDevice.cpp
index 6b918cd..230d2d0 100644
--- a/src/Vulkan/VkDevice.cpp
+++ b/src/Vulkan/VkDevice.cpp
@@ -36,6 +36,32 @@
 namespace vk
 {
 
+std::shared_ptr<rr::Routine> Device::SamplingRoutineCache::query(const vk::Device::SamplingRoutineCache::Key& key) const
+{
+	return cache.query(hash(key));
+}
+
+void Device::SamplingRoutineCache::add(const vk::Device::SamplingRoutineCache::Key& key, const std::shared_ptr<rr::Routine>& routine)
+{
+	ASSERT(routine);
+	cache.add(hash(key), routine);
+}
+
+std::shared_ptr<rr::Routine> Device::SamplingRoutineCache::queryConst(const vk::Device::SamplingRoutineCache::Key& key) const
+{
+	return cache.queryConstCache(hash(key));
+}
+
+void Device::SamplingRoutineCache::updateConstCache()
+{
+	cache.updateConstCache();
+}
+
+std::size_t Device::SamplingRoutineCache::hash(const vk::Device::SamplingRoutineCache::Key &key)
+{
+	return (key.instruction << 16) ^ (key.sampler << 8) ^ key.imageView;
+}
+
 Device::Device(const VkDeviceCreateInfo* pCreateInfo, void* mem, PhysicalDevice *physicalDevice, const VkPhysicalDeviceFeatures *enabledFeatures)
 	: physicalDevice(physicalDevice),
 	  queues(reinterpret_cast<Queue*>(mem)),
@@ -55,7 +81,7 @@
 
 		for(uint32_t j = 0; j < queueCreateInfo.queueCount; j++, queueID++)
 		{
-			new (&queues[queueID]) Queue();
+			new (&queues[queueID]) Queue(this);
 		}
 	}
 
@@ -72,7 +98,8 @@
 	}
 
 	// FIXME (b/119409619): use an allocator here so we can control all memory allocations
-	blitter = new sw::Blitter();
+	blitter.reset(new sw::Blitter());
+	samplingRoutineCache.reset(new SamplingRoutineCache());
 }
 
 void Device::destroy(const VkAllocationCallbacks* pAllocator)
@@ -83,8 +110,6 @@
 	}
 
 	vk::deallocate(queues, pAllocator);
-
-	delete blitter;
 }
 
 size_t Device::ComputeRequiredAllocationSize(const VkDeviceCreateInfo* pCreateInfo)
@@ -212,7 +237,7 @@
 {
 	for(uint32_t i = 0; i < descriptorWriteCount; i++)
 	{
-		DescriptorSetLayout::WriteDescriptorSet(pDescriptorWrites[i]);
+		DescriptorSetLayout::WriteDescriptorSet(this, pDescriptorWrites[i]);
 	}
 
 	for(uint32_t i = 0; i < descriptorCopyCount; i++)
@@ -221,4 +246,25 @@
 	}
 }
 
+Device::SamplingRoutineCache* Device::getSamplingRoutineCache() const
+{
+	return samplingRoutineCache.get();
+}
+
+std::shared_ptr<rr::Routine> Device::findInConstCache(const SamplingRoutineCache::Key& key) const
+{
+	return samplingRoutineCache->queryConst(key);
+}
+
+void Device::updateSamplingRoutineConstCache()
+{
+	std::unique_lock<std::mutex> lock(samplingRoutineCacheMutex);
+	samplingRoutineCache->updateConstCache();
+}
+
+std::mutex& Device::getSamplingRoutineCacheMutex()
+{
+	return samplingRoutineCacheMutex;
+}
+
 } // namespace vk
diff --git a/src/Vulkan/VkDevice.hpp b/src/Vulkan/VkDevice.hpp
index 3e262d3..721dda2 100644
--- a/src/Vulkan/VkDevice.hpp
+++ b/src/Vulkan/VkDevice.hpp
@@ -16,6 +16,10 @@
 #define VK_DEVICE_HPP_
 
 #include "VkObject.hpp"
+#include "Device/LRUCache.hpp"
+#include "Reactor/Routine.hpp"
+#include <memory>
+#include <mutex>
 
 namespace sw
 {
@@ -48,19 +52,49 @@
 	void updateDescriptorSets(uint32_t descriptorWriteCount, const VkWriteDescriptorSet* pDescriptorWrites,
 	                          uint32_t descriptorCopyCount, const VkCopyDescriptorSet* pDescriptorCopies);
 	const VkPhysicalDeviceFeatures &getEnabledFeatures() const { return enabledFeatures; }
-	sw::Blitter* getBlitter() const { return blitter; }
+	sw::Blitter* getBlitter() const { return blitter.get(); }
+
+	class SamplingRoutineCache
+	{
+	public:
+		SamplingRoutineCache() : cache(1024) {}
+		~SamplingRoutineCache() {}
+
+		struct Key
+		{
+			uint32_t instruction;
+			uint32_t sampler;
+			uint32_t imageView;
+		};
+
+		std::shared_ptr<rr::Routine> query(const Key& key) const;
+		void add(const Key& key, const std::shared_ptr<rr::Routine>& routine);
+
+		std::shared_ptr<rr::Routine> queryConst(const Key& key) const;
+		void updateConstCache();
+
+		static std::size_t hash(const Key &key);
+
+	private:
+		sw::LRUConstCache<std::size_t, std::shared_ptr<rr::Routine>> cache;
+	};
+
+	SamplingRoutineCache* getSamplingRoutineCache() const;
+	std::mutex& getSamplingRoutineCacheMutex();
+	std::shared_ptr<rr::Routine> findInConstCache(const SamplingRoutineCache::Key& key) const;
+	void updateSamplingRoutineConstCache();
 
 private:
 	PhysicalDevice *const physicalDevice = nullptr;
 	Queue *const queues = nullptr;
 	uint32_t queueCount = 0;
-
-	const uint32_t enabledExtensionCount = 0;
+	std::unique_ptr<sw::Blitter> blitter;
+	std::unique_ptr<SamplingRoutineCache> samplingRoutineCache;
+	std::mutex samplingRoutineCacheMutex;
+	uint32_t enabledExtensionCount = 0;
 	typedef char ExtensionName[VK_MAX_EXTENSION_NAME_SIZE];
 	ExtensionName* extensions = nullptr;
 	const VkPhysicalDeviceFeatures enabledFeatures = {};
-
-	sw::Blitter* blitter = nullptr;
 };
 
 using DispatchableDevice = DispatchableObject<Device, VkDevice>;
diff --git a/src/Vulkan/VkGetProcAddress.cpp b/src/Vulkan/VkGetProcAddress.cpp
index 4bfb07e..c132e7f 100644
--- a/src/Vulkan/VkGetProcAddress.cpp
+++ b/src/Vulkan/VkGetProcAddress.cpp
@@ -93,6 +93,10 @@
 	MAKE_VULKAN_INSTANCE_ENTRY(vkCreateXlibSurfaceKHR),
 	MAKE_VULKAN_INSTANCE_ENTRY(vkGetPhysicalDeviceXlibPresentationSupportKHR),
 #endif
+#ifdef VK_USE_PLATFORM_MACOS_MVK
+    // VK_MVK_macos_surface
+    MAKE_VULKAN_INSTANCE_ENTRY(vkCreateMacOSSurfaceMVK),
+#endif
 };
 #undef MAKE_VULKAN_INSTANCE_ENTRY
 
diff --git a/src/Vulkan/VkPipeline.cpp b/src/Vulkan/VkPipeline.cpp
index ba1b8db..e6bb5be 100644
--- a/src/Vulkan/VkPipeline.cpp
+++ b/src/Vulkan/VkPipeline.cpp
@@ -262,7 +262,7 @@
 	// TODO(b/119409619): use allocator.
 	auto program = std::make_shared<sw::ComputeProgram>(key.getShader(), key.getLayout(), descriptorSets);
 	program->generate();
-	program->finalize(vk::ReactorOptimizationLevel);
+	program->finalize();
 	return program;
 }
 
diff --git a/src/Vulkan/VkPipelineLayout.cpp b/src/Vulkan/VkPipelineLayout.cpp
index cd47bab..da0d3ae 100644
--- a/src/Vulkan/VkPipelineLayout.cpp
+++ b/src/Vulkan/VkPipelineLayout.cpp
@@ -40,9 +40,10 @@
 	uint32_t dynamicOffsetBase = 0;
 	for (uint32_t i = 0; i < setLayoutCount; i++)
 	{
-		ASSERT_OR_RETURN(dynamicOffsetBase < MAX_DESCRIPTOR_SET_COMBINED_BUFFERS_DYNAMIC);
-		dynamicOffsetBases[i] = dynamicOffsetBase;
-		dynamicOffsetBase += setLayouts[i]->getDynamicDescriptorCount();
+		uint32_t dynamicDescriptorCount = setLayouts[i]->getDynamicDescriptorCount();

+		ASSERT_OR_RETURN((dynamicOffsetBase + dynamicDescriptorCount) <= MAX_DESCRIPTOR_SET_COMBINED_BUFFERS_DYNAMIC);

+		dynamicOffsetBases[i] = dynamicOffsetBase;

+		dynamicOffsetBase += dynamicDescriptorCount;
 	}
 }
 
diff --git a/src/Vulkan/VkQueue.cpp b/src/Vulkan/VkQueue.cpp
index 4c03198..3aee60a 100644
--- a/src/Vulkan/VkQueue.cpp
+++ b/src/Vulkan/VkQueue.cpp
@@ -74,7 +74,7 @@
 namespace vk
 {
 
-Queue::Queue() : renderer()
+Queue::Queue(Device* device) : renderer(device)
 {
 	queueThread = std::thread(TaskLoop, this);
 }
diff --git a/src/Vulkan/VkQueue.hpp b/src/Vulkan/VkQueue.hpp
index cfa462b..2926e10 100644
--- a/src/Vulkan/VkQueue.hpp
+++ b/src/Vulkan/VkQueue.hpp
@@ -31,6 +31,7 @@
 namespace vk
 {
 
+class Device;
 class Fence;
 
 class Queue
@@ -38,7 +39,7 @@
 	VK_LOADER_DATA loaderData = { ICD_LOADER_MAGIC };
 
 public:
-	Queue();
+	Queue(Device* device);
 	~Queue();
 
 	operator VkQueue()
diff --git a/src/Vulkan/libVulkan.cpp b/src/Vulkan/libVulkan.cpp
index fc1e210..ef70b3f 100644
--- a/src/Vulkan/libVulkan.cpp
+++ b/src/Vulkan/libVulkan.cpp
@@ -42,6 +42,10 @@
 #include "VkShaderModule.hpp"
 #include "VkRenderPass.hpp"
 
+#ifdef VK_USE_PLATFORM_MACOS_MVK
+#include "WSI/MacOSSurfaceMVK.h"
+#endif
+
 #ifdef VK_USE_PLATFORM_XLIB_KHR
 #include "WSI/XlibSurfaceKHR.hpp"
 #endif
@@ -54,6 +58,8 @@
 
 #include "WSI/VkSwapchainKHR.hpp"
 
+#include "Reactor/Nucleus.hpp"
+
 #include <algorithm>
 #include <cstring>
 #include <string>
@@ -75,6 +81,34 @@
 	return false;
 }
 
+// setReactorDefaultConfig() sets the default configuration for Vulkan's use of
+// Reactor.
+void setReactorDefaultConfig()
+{
+	auto cfg = rr::Config::Edit()
+		.set(rr::Optimization::Level::Default)
+		.clearOptimizationPasses()
+		.add(rr::Optimization::Pass::ScalarReplAggregates)
+		.add(rr::Optimization::Pass::SCCP)
+		.add(rr::Optimization::Pass::CFGSimplification)
+		.add(rr::Optimization::Pass::EarlyCSEPass)
+		.add(rr::Optimization::Pass::CFGSimplification)
+		.add(rr::Optimization::Pass::InstructionCombining);
+
+	rr::Nucleus::adjustDefaultConfig(cfg);
+}
+
+// initializeLibrary() is called by vkCreateInstance() to perform one-off global
+// initialization of the swiftshader driver.
+void initializeLibrary()
+{
+	static bool doOnce = [] {
+		setReactorDefaultConfig();
+		return true;
+	}();
+	(void)doOnce;
+}
+
 }
 
 extern "C"
@@ -105,6 +139,9 @@
 #ifdef VK_USE_PLATFORM_XLIB_KHR
 	{ VK_KHR_XLIB_SURFACE_EXTENSION_NAME, VK_KHR_XLIB_SURFACE_SPEC_VERSION },
 #endif
+#ifdef VK_USE_PLATFORM_MACOS_MVK
+    { VK_MVK_MACOS_SURFACE_EXTENSION_NAME, VK_MVK_MACOS_SURFACE_SPEC_VERSION },
+#endif
 };
 
 static const VkExtensionProperties deviceExtensionProperties[] =
@@ -139,6 +176,8 @@
 	TRACE("(const VkInstanceCreateInfo* pCreateInfo = %p, const VkAllocationCallbacks* pAllocator = %p, VkInstance* pInstance = %p)",
 			pCreateInfo, pAllocator, pInstance);
 
+	initializeLibrary();
+
 	if(pCreateInfo->enabledLayerCount)
 	{
 		UNIMPLEMENTED("pCreateInfo->enabledLayerCount");
@@ -2526,7 +2565,7 @@
 	TRACE("(VkDevice device = %p, VkDescriptorSet descriptorSet = %p, VkDescriptorUpdateTemplate descriptorUpdateTemplate = %p, const void* pData = %p)",
 	      device, static_cast<void*>(descriptorSet), static_cast<void*>(descriptorUpdateTemplate), pData);
 
-	vk::Cast(descriptorUpdateTemplate)->updateDescriptorSet(descriptorSet, pData);
+	vk::Cast(descriptorUpdateTemplate)->updateDescriptorSet(vk::Cast(device), descriptorSet, pData);
 }
 
 VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceExternalBufferProperties(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalBufferInfo* pExternalBufferInfo, VkExternalBufferProperties* pExternalBufferProperties)
@@ -2579,6 +2618,16 @@
 }
 #endif
 
+#ifdef VK_USE_PLATFORM_MACOS_MVK
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateMacOSSurfaceMVK(VkInstance instance, const VkMacOSSurfaceCreateInfoMVK* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface)
+{
+    TRACE("(VkInstance instance = %p, VkMacOSSurfaceCreateInfoMVK* pCreateInfo = %p, VkAllocationCallbacks* pAllocator = %p, VkSurface* pSurface = %p)",
+          instance, pCreateInfo, pAllocator, pSurface);
+
+    return vk::MacOSSurfaceMVK::Create(pAllocator, pCreateInfo, pSurface);
+}
+#endif
+
 #ifndef __ANDROID__
 VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR(VkInstance instance, VkSurfaceKHR surface, const VkAllocationCallbacks* pAllocator)
 {
diff --git a/src/Vulkan/vulkan.vcxproj b/src/Vulkan/vulkan.vcxproj
index 9da32ff..e6c5020 100644
--- a/src/Vulkan/vulkan.vcxproj
+++ b/src/Vulkan/vulkan.vcxproj
@@ -67,6 +67,7 @@
       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <EnableCOMDATFolding>true</EnableCOMDATFolding>

@@ -93,6 +94,7 @@
       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <TreatSpecificWarningsAsErrors>4018;5038;4838</TreatSpecificWarningsAsErrors>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <ModuleDefinitionFile>libvk_swiftshader.def</ModuleDefinitionFile>

@@ -303,4 +305,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

   <ImportGroup Label="ExtensionTargets">

   </ImportGroup>

-</Project>

+</Project>
\ No newline at end of file
diff --git a/src/WSI/MacOSSurfaceMVK.h b/src/WSI/MacOSSurfaceMVK.h
new file mode 100644
index 0000000..7822fb6
--- /dev/null
+++ b/src/WSI/MacOSSurfaceMVK.h
@@ -0,0 +1,45 @@
+// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SWIFTSHADER_MACOSSURFACEMVK_HPP
+#define SWIFTSHADER_MACOSSURFACEMVK_HPP
+
+#include "Vulkan/VkObject.hpp"
+#include "VkSurfaceKHR.hpp"
+#include "vulkan/vulkan_macos.h"
+
+namespace vk {
+
+class MetalLayer;
+
+class MacOSSurfaceMVK : public SurfaceKHR, public ObjectBase<MacOSSurfaceMVK, VkSurfaceKHR> {
+public:
+    // mem is the extra allocation sized by ComputeRequiredAllocationSize(); it holds the MetalLayer.
+    MacOSSurfaceMVK(const VkMacOSSurfaceCreateInfoMVK *pCreateInfo, void *mem);
+    static size_t ComputeRequiredAllocationSize(const VkMacOSSurfaceCreateInfoMVK *pCreateInfo);
+
+    // Overrides of inherited virtuals.
+    void destroySurface(const VkAllocationCallbacks *pAllocator) override;
+    void getSurfaceCapabilities(VkSurfaceCapabilitiesKHR *pSurfaceCapabilities) const override;
+    void attachImage(PresentImage* image) override {}  // intentionally a no-op
+    void detachImage(PresentImage* image) override {}  // intentionally a no-op
+    void present(PresentImage* image) override;
+
+private:
+    // Opaque here; MetalLayer is defined in MacOSSurfaceMVK.mm.
+    MetalLayer* metalLayer = nullptr;
+};
+
+}
+#endif //SWIFTSHADER_MACOSSURFACEMVK_HPP
diff --git a/src/WSI/MacOSSurfaceMVK.mm b/src/WSI/MacOSSurfaceMVK.mm
new file mode 100644
index 0000000..090cc20
--- /dev/null
+++ b/src/WSI/MacOSSurfaceMVK.mm
@@ -0,0 +1,144 @@
+// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "MacOSSurfaceMVK.h"
+#include "Vulkan/VkDeviceMemory.hpp"
+#include "Vulkan/VkImage.hpp"
+
+#include <Metal/Metal.h>
+#include <QuartzCore/CAMetalLayer.h>
+#include <AppKit/NSView.h>
+
+namespace vk {
+
+class MetalLayer  // Holds the NSView handed to the surface and the CAMetalLayer backing it.
+{
+public:
+    void init(const void* pView)  // Retains pView if it is an NSView, and its layer if that is a CAMetalLayer; otherwise they stay null.
+    {
+        view = nullptr;
+        layer = nullptr;
+
+        id<NSObject> obj = (id<NSObject>)pView;
+
+        if([obj isKindOfClass: [NSView class]])
+        {
+            if(!NSThread.isMainThread)
+            {
+                UNREACHABLE("MetalLayer::init(): not called from main thread");
+            }
+            view = (NSView*)[obj retain];  // balanced in release()
+
+            obj = view.layer;
+            if ([obj isKindOfClass: [CAMetalLayer class]])
+            {
+                layer = (CAMetalLayer*)[obj retain];  // balanced in release()
+            }
+            else
+            {
+                UNREACHABLE("MetalLayer::init(): view doesn't have metal backed layer");
+            }
+        }
+    }
+
+    void release()  // Releases whatever init() retained; the pointers themselves are left untouched.
+    {
+        if(layer)
+        {
+            [layer release];
+        }
+
+        if(view)
+        {
+            [view release];
+        }
+    }
+
+    VkExtent2D getExtent() const  // Layer bounds times contentsScale, truncated; {0, 0} when there is no layer.
+    {
+        if(layer)
+        {
+            CGSize drawSize = layer.bounds.size;
+            CGFloat scaleFactor = layer.contentsScale;
+            drawSize.width = trunc(drawSize.width * scaleFactor);
+            drawSize.height = trunc(drawSize.height * scaleFactor);
+            return { static_cast<uint32_t>(drawSize.width), static_cast<uint32_t>(drawSize.height) };
+        }
+        else
+        {
+            return { 0, 0 };
+        }
+    }
+
+    id<CAMetalDrawable> getNextDrawable() const  // The layer's next drawable, or nil when there is no layer.
+    {
+        if(layer)
+        {
+            return [layer nextDrawable];
+        }
+
+        return nil;
+    }
+
+private:
+    NSView* view;  // assigned by init(); no default initializer
+    CAMetalLayer* layer;  // assigned by init(); no default initializer
+};
+
+MacOSSurfaceMVK::MacOSSurfaceMVK(const VkMacOSSurfaceCreateInfoMVK *pCreateInfo, void *mem) :
+    metalLayer(reinterpret_cast<MetalLayer*>(mem))  // mem is reinterpreted as MetalLayer storage; no MetalLayer constructor is invoked here
+{
+    metalLayer->init(pCreateInfo->pView);  // init() assigns both MetalLayer members
+}
+
+void MacOSSurfaceMVK::destroySurface(const VkAllocationCallbacks *pAllocator)
+{
+    if(metalLayer)
+    {
+        metalLayer->release();  // drop the retained view/layer before the storage goes away
+    }
+
+    vk::deallocate(metalLayer, pAllocator);  // metalLayer is the mem pointer given to the constructor
+}
+
+size_t MacOSSurfaceMVK::ComputeRequiredAllocationSize(const VkMacOSSurfaceCreateInfoMVK *pCreateInfo)
+{
+    return sizeof(MetalLayer);  // pCreateInfo is unused; the size is that of the MetalLayer the constructor places in mem
+}
+
+void MacOSSurfaceMVK::getSurfaceCapabilities(VkSurfaceCapabilitiesKHR *pSurfaceCapabilities) const
+{
+    // Let the SurfaceKHR base fill the structure first, then pin every extent to the layer's size.
+    SurfaceKHR::getSurfaceCapabilities(pSurfaceCapabilities);
+
+    const VkExtent2D layerExtent = metalLayer->getExtent();
+    pSurfaceCapabilities->maxImageExtent = layerExtent;
+    pSurfaceCapabilities->minImageExtent = pSurfaceCapabilities->currentExtent = layerExtent;
+}
+
+void MacOSSurfaceMVK::present(PresentImage* image)
+{
+    // Copies mip level 0 of the image's color aspect into the layer's next drawable and presents it.
+    @autoreleasepool {  // -nextDrawable returns an autoreleased drawable; drain it every frame
+        auto drawable = metalLayer->getNextDrawable();
+        if(!drawable) { return; }  // nil when no layer was attached or the layer gave no drawable
+        VkExtent3D extent = image->getImage()->getMipLevelExtent(VK_IMAGE_ASPECT_COLOR_BIT, 0);
+        [drawable.texture replaceRegion:MTLRegionMake2D(0, 0, extent.width, extent.height) mipmapLevel:0
+                          withBytes:image->getImageMemory()->getOffsetPointer(0)
+                          bytesPerRow:image->getImage()->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0)];
+        [drawable present];
+    }
+}
+
+}
diff --git a/src/WSI/VkSurfaceKHR.cpp b/src/WSI/VkSurfaceKHR.cpp
index 92fd426..f704ceb 100644
--- a/src/WSI/VkSurfaceKHR.cpp
+++ b/src/WSI/VkSurfaceKHR.cpp
@@ -24,6 +24,7 @@
 static const VkSurfaceFormatKHR surfaceFormats[] =
 {
 	{VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR},
+	{VK_FORMAT_B8G8R8A8_SRGB, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR},  // same BGRA8 layout, sRGB-encoded
 };
 
 static const VkPresentModeKHR presentModes[] =
diff --git a/tests/GLESUnitTests/unittests.cpp b/tests/GLESUnitTests/unittests.cpp
index db4de08..7951e27 100644
--- a/tests/GLESUnitTests/unittests.cpp
+++ b/tests/GLESUnitTests/unittests.cpp
@@ -2612,8 +2612,8 @@
 	Uninitialize();
 }
 
-// Test IOSurface pbuffers cannot be made current
-TEST_F(IOSurfaceClientBufferTest, MakeCurrentDisallowed)
+// Test IOSurface pbuffers can be made current
+TEST_F(IOSurfaceClientBufferTest, MakeCurrentAllowed)
 {
 	Initialize(3, false);
 
@@ -2623,8 +2623,8 @@
 		EGLSurface pbuffer = createIOSurfacePbuffer(clientBufferWrapper.getClientBuffer(), 10, 10, 0, GL_BGRA_EXT, GL_UNSIGNED_BYTE);
 
 		EGLBoolean result = eglMakeCurrent(getDisplay(), pbuffer, pbuffer, getContext());
-		EXPECT_EQ((EGLBoolean)EGL_FALSE, result);
-		EXPECT_EQ(EGL_BAD_SURFACE, eglGetError());
+		EXPECT_EQ((EGLBoolean)EGL_TRUE, result);
+		EXPECT_NO_EGL_ERROR();
 	}
 
 	Uninitialize();
diff --git a/tests/VulkanUnitTests/VulkanUnitTests.vcxproj b/tests/VulkanUnitTests/VulkanUnitTests.vcxproj
index df85535..099dc28 100644
--- a/tests/VulkanUnitTests/VulkanUnitTests.vcxproj
+++ b/tests/VulkanUnitTests/VulkanUnitTests.vcxproj
@@ -70,6 +70,7 @@
       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

@@ -93,6 +94,7 @@
       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <SubSystem>Console</SubSystem>

diff --git a/tests/fuzzers/VertexRoutineFuzzer.cpp b/tests/fuzzers/VertexRoutineFuzzer.cpp
index d63e293..28327b1 100644
--- a/tests/fuzzers/VertexRoutineFuzzer.cpp
+++ b/tests/fuzzers/VertexRoutineFuzzer.cpp
@@ -203,11 +203,10 @@
 	sw::VertexProgram program(state, bytecodeShader.get());
 	program.generate();
 
-	sw::Routine *routine = program("VertexRoutine");
+	auto routine = program("VertexRoutine");
 	assert(routine);
 	const void *entry = routine->getEntry();
 	assert(entry); (void)entry;
-	delete routine;
 
 	return 0;
 }
diff --git a/tests/fuzzers/VertexRoutineFuzzer.vcxproj b/tests/fuzzers/VertexRoutineFuzzer.vcxproj
index a3aec3b..1c75243 100644
--- a/tests/fuzzers/VertexRoutineFuzzer.vcxproj
+++ b/tests/fuzzers/VertexRoutineFuzzer.vcxproj
@@ -97,6 +97,7 @@
       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <AdditionalDependencies>WS2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>

@@ -151,6 +152,7 @@
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>

       <AdditionalOptions>/permissive- %(AdditionalOptions)</AdditionalOptions>

       <MultiProcessorCompilation>true</MultiProcessorCompilation>

+      <DisableSpecificWarnings>4267</DisableSpecificWarnings>

     </ClCompile>

     <Link>

       <EnableCOMDATFolding>true</EnableCOMDATFolding>

diff --git a/tests/regres/main.go b/tests/regres/main.go
index 0549874..fcfeea7 100644
--- a/tests/regres/main.go
+++ b/tests/regres/main.go
@@ -993,9 +993,9 @@
 			continue
 		}
 		switch {
-		case old.Status.Passing() && new.Status.Failing():
+		case !old.Status.Failing() && new.Status.Failing():
 			broken = append(broken, test)
-		case old.Status.Failing() && new.Status.Passing():
+		case !old.Status.Passing() && new.Status.Passing():
 			fixed = append(fixed, test)
 		case old.Status != new.Status:
 			changed = append(changed, test)