Merge changes I2b7adc3c,I5873dfa8

* changes:
  Update Marl to ca8408f68
  Squashed 'third_party/marl/' changes from 64d123947..ca8408f68
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index fff67ad..83f8126 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -611,7 +611,8 @@
 
 UInt r11g11b10Pack(const Float4 &value)
 {
-	auto halfBits = floatToHalfBits(As<UInt4>(value), true);
+	// 10 and 11 bit floats are unsigned, so their minimum value is 0
+	auto halfBits = floatToHalfBits(As<UInt4>(Max(value, Float4(0.0f))), true);
 	// Truncates instead of rounding. See b/147900455
 	UInt4 truncBits = halfBits & UInt4(0x7FF00000, 0x7FF00000, 0x7FE00000, 0);
 	return (UInt(truncBits.x) >> 20) | (UInt(truncBits.y) >> 9) | (UInt(truncBits.z) << 1);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 9a83795..df72f66 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -58,37 +58,6 @@
 // These functions only accept and return Subzero (Ice) types, and do not access any globals.
 namespace {
 namespace sz {
-void replaceEntryNode(Ice::Cfg *function, Ice::CfgNode *newEntryNode)
-{
-	ASSERT_MSG(function->getEntryNode() != nullptr, "Function should have an entry node");
-
-	if(function->getEntryNode() == newEntryNode)
-	{
-		return;
-	}
-
-	// Make this the new entry node
-	function->setEntryNode(newEntryNode);
-
-	// Reorder nodes so that new entry block comes first. This is required
-	// by Cfg::renumberInstructions, which expects the first node in the list
-	// to be the entry node.
-	{
-		auto nodes = function->getNodes();
-
-		// TODO(amaiorano): Fast path if newEntryNode is last? Can avoid linear search.
-
-		auto iter = std::find(nodes.begin(), nodes.end(), newEntryNode);
-		ASSERT_MSG(iter != nodes.end(), "New node should be in the function's node list");
-
-		nodes.erase(iter);
-		nodes.insert(nodes.begin(), newEntryNode);
-
-		// swapNodes replaces its nodes with the input one, and renumbers them,
-		// so our new entry node will be 0, and the previous will be 1.
-		function->swapNodes(nodes);
-	}
-}
 
 Ice::Cfg *createFunction(Ice::GlobalContext *context, Ice::Type returnType, const std::vector<Ice::Type> &paramTypes)
 {
@@ -194,6 +163,8 @@
 template<typename Return, typename... CArgs, typename... RArgs>
 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Return(fptr)(CArgs...), RArgs &&... args)
 {
+	static_assert(sizeof...(CArgs) == sizeof...(RArgs), "Expected number of args don't match");
+
 	Ice::Type retTy = T(rr::CToReactorT<Return>::getType());
 	std::vector<Ice::Operand *> iceArgs{ std::forward<RArgs>(args)... };
 	return Call(function, basicBlock, retTy, reinterpret_cast<void const *>(fptr), iceArgs, false);
@@ -256,6 +227,8 @@
 
 Ice::GlobalContext *context = nullptr;
 Ice::Cfg *function = nullptr;
+Ice::CfgNode *entryBlock = nullptr;
+Ice::CfgNode *basicBlockTop = nullptr;
 Ice::CfgNode *basicBlock = nullptr;
 Ice::CfgLocalAllocatorScope *allocator = nullptr;
 rr::ELFMemoryStreamer *routine = nullptr;
@@ -489,12 +462,17 @@
 	return Ice::typeWidthInBytes(T(type));
 }
 
-static void createRetVoidIfNoRet()
+static void finalizeFunction()
 {
+	// Create a return if none was added
 	if(::basicBlock->getInsts().empty() || ::basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
 	{
 		Nucleus::createRetVoid();
 	}
+
+	// Connect the entry block to the top of the initial basic block
+	auto br = Ice::InstBr::create(::function, ::basicBlockTop);
+	::entryBlock->appendInst(br);
 }
 
 using ElfHeader = std::conditional<sizeof(void *) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
@@ -656,13 +634,23 @@
 	return symbolValue;
 }
 
-void *loadImage(uint8_t *const elfImage, size_t &codeSize, const char *functionName = nullptr)
+struct EntryPoint
 {
+	const void *entry;
+	size_t codeSize = 0;
+};
+
+std::vector<EntryPoint> loadImage(uint8_t *const elfImage, const std::vector<const char *> &functionNames)
+{
+	ASSERT(functionNames.size() > 0);
+	std::vector<EntryPoint> entryPoints(functionNames.size());
+
 	ElfHeader *elfHeader = (ElfHeader *)elfImage;
 
+	// TODO: assert?
 	if(!elfHeader->checkMagic())
 	{
-		return nullptr;
+		return {};
 	}
 
 	// Expect ELF bitness to match platform
@@ -682,7 +670,6 @@
 #endif
 
 	SectionHeader *sectionHeader = (SectionHeader *)(elfImage + elfHeader->e_shoff);
-	void *entry = nullptr;
 
 	for(int i = 0; i < elfHeader->e_shnum; i++)
 	{
@@ -690,17 +677,25 @@
 		{
 			if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
 			{
-				auto getCurrSectionName = [&]() {
+				auto findSectionNameEntryIndex = [&]() -> size_t {
 					auto sectionNameOffset = sectionHeader[elfHeader->e_shstrndx].sh_offset + sectionHeader[i].sh_name;
-					return reinterpret_cast<const char *>(elfImage + sectionNameOffset);
-				};
-				if(functionName && strstr(getCurrSectionName(), functionName) == nullptr)
-				{
-					continue;
-				}
+					const char *sectionName = reinterpret_cast<const char *>(elfImage + sectionNameOffset);
 
-				entry = elfImage + sectionHeader[i].sh_offset;
-				codeSize = sectionHeader[i].sh_size;
+					for(size_t j = 0; j < functionNames.size(); ++j)
+					{
+						if(strstr(sectionName, functionNames[j]) != nullptr)
+						{
+							return j;
+						}
+					}
+
+					UNREACHABLE("Failed to find executable section that matches input function names");
+					return static_cast<size_t>(-1);
+				};
+
+				size_t index = findSectionNameEntryIndex();
+				entryPoints[index].entry = elfImage + sectionHeader[i].sh_offset;
+				entryPoints[index].codeSize = sectionHeader[i].sh_size;
 			}
 		}
 		else if(sectionHeader[i].sh_type == SHT_REL)
@@ -725,7 +720,7 @@
 		}
 	}
 
-	return entry;
+	return entryPoints;
 }
 
 template<typename T>
@@ -796,18 +791,20 @@
 
 	void seek(uint64_t Off) override { position = Off; }
 
-	const void *getEntryByName(const char *name)
+	std::vector<EntryPoint> loadImageAndGetEntryPoints(const std::vector<const char *> &functionNames)
 	{
-		size_t codeSize = 0;
-		const void *entry = loadImage(&buffer[0], codeSize, name);
+		auto entryPoints = loadImage(&buffer[0], functionNames);
 
 #if defined(_WIN32)
 		FlushInstructionCache(GetCurrentProcess(), NULL, 0);
 #else
-		__builtin___clear_cache((char *)entry, (char *)entry + codeSize);
+		for(auto &entryPoint : entryPoints)
+		{
+			__builtin___clear_cache((char *)entryPoint.entry, (char *)entryPoint.entry + entryPoint.codeSize);
+		}
 #endif
 
-		return entry;
+		return entryPoints;
 	}
 
 	void finalize()
@@ -926,7 +923,9 @@
 	delete ::out;
 	::out = nullptr;
 
+	::entryBlock = nullptr;
 	::basicBlock = nullptr;
+	::basicBlockTop = nullptr;
 
 	::codegenMutex.unlock();
 }
@@ -1026,10 +1025,11 @@
 	objectWriter->writeNonUserSections();
 
 	// Done compiling functions, get entry pointers to each of them
-	for(size_t i = 0; i < Count; ++i)
+	auto entryPoints = ::routine->loadImageAndGetEntryPoints({ names, names + Count });
+	ASSERT(entryPoints.size() == Count);
+	for(size_t i = 0; i < entryPoints.size(); ++i)
 	{
-		const void *entry = ::routine->getEntryByName(names[i]);
-		::routine->setEntry(i, entry);
+		::routine->setEntry(i, entryPoints[i].entry);
 	}
 
 	::routine->finalize();
@@ -1042,7 +1042,7 @@
 
 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
 {
-	createRetVoidIfNoRet();
+	finalizeFunction();
 	return rr::acquireRoutine({ ::function }, { name }, cfgEdit);
 }
 
@@ -1083,7 +1083,9 @@
 {
 	ASSERT(::function == nullptr);
 	ASSERT(::allocator == nullptr);
+	ASSERT(::entryBlock == nullptr);
 	ASSERT(::basicBlock == nullptr);
+	ASSERT(::basicBlockTop == nullptr);
 
 	::function = sz::createFunction(::context, T(returnType), T(paramTypes));
 
@@ -1093,7 +1095,9 @@
 	// TODO: Get rid of this as a global, and create scoped allocs in every Nucleus function instead.
 	::allocator = new Ice::CfgLocalAllocatorScope(::function);
 
-	::basicBlock = ::function->getEntryNode();
+	::entryBlock = ::function->getEntryNode();
+	::basicBlock = ::function->makeNode();
+	::basicBlockTop = ::basicBlock;
 }
 
 Value *Nucleus::getArgument(unsigned int index)
@@ -4595,29 +4599,13 @@
 		//        ... <REACTOR CODE> ...
 		//
 
-		// Save original entry block and current block, and create a new entry block and make it current.
-		// This new block will be used to inject code above the begin routine's existing code. We make
-		// this block branch to the original entry block as the last instruction.
-		auto origEntryBB = ::function->getEntryNode();
-		auto origCurrBB = ::basicBlock;
-		auto newBB = ::function->makeNode();
-		sz::replaceEntryNode(::function, newBB);
-		::basicBlock = newBB;
-
 		//        this->handle = coro::getHandleParam();
-		this->handle = sz::Call(::function, ::basicBlock, coro::getHandleParam);
+		this->handle = sz::Call(::function, ::entryBlock, coro::getHandleParam);
 
 		//        YieldType promise;
 		//        coro::setPromisePtr(handle, &promise); // For await
 		this->promise = sz::allocateStackVariable(::function, T(::coroYieldType));
-		sz::Call(::function, ::basicBlock, coro::setPromisePtr, this->handle, this->promise);
-
-		// Branch to original entry block
-		auto br = Ice::InstBr::create(::function, origEntryBB);
-		::basicBlock->appendInst(br);
-
-		// Restore current block for future instructions
-		::basicBlock = origCurrBB;
+		sz::Call(::function, ::entryBlock, coro::setPromisePtr, this->handle, this->promise);
 	}
 
 	// Adds instructions for Yield() calls at the current location of the main coroutine function.
@@ -4719,7 +4707,7 @@
 		//         <resumeBlock>
 		//     }
 		Ice::CfgNode *bb = awaitFunc->getEntryNode();
-		Ice::Variable *done = sz::Call(awaitFunc, bb, coro::isDone);
+		Ice::Variable *done = sz::Call(awaitFunc, bb, coro::isDone, handle);
 		auto br = Ice::InstBr::create(awaitFunc, done, doneBlock, resumeBlock);
 		bb->appendInst(br);
 
@@ -4831,7 +4819,7 @@
 		// Finish generating coroutine functions
 		{
 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
-			createRetVoidIfNoRet();
+			finalizeFunction();
 		}
 
 		auto awaitFunc = ::coroGen->generateAwaitFunction();
@@ -4851,7 +4839,7 @@
 	{
 		{
 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
-			createRetVoidIfNoRet();
+			finalizeFunction();
 		}
 
 		::coroYieldType = nullptr;
diff --git a/src/Vulkan/BUILD.gn b/src/Vulkan/BUILD.gn
index 9ea2d28..19b7e79 100644
--- a/src/Vulkan/BUILD.gn
+++ b/src/Vulkan/BUILD.gn
@@ -101,8 +101,6 @@
     ]
   } else if (is_fuchsia) {
     sources += [ "VkSemaphoreExternalFuchsia.hpp" ]
-  } else {
-    sources += [ "VkSemaphoreExternalNone.hpp" ]
   }
 }
 
diff --git a/src/Vulkan/VkSemaphore.cpp b/src/Vulkan/VkSemaphore.cpp
index 72bee8b..8daef84 100644
--- a/src/Vulkan/VkSemaphore.cpp
+++ b/src/Vulkan/VkSemaphore.cpp
@@ -17,18 +17,6 @@
 #include "VkConfig.h"
 #include "VkStringify.hpp"
 
-#if SWIFTSHADER_EXTERNAL_SEMAPHORE_OPAQUE_FD
-#	if defined(__linux__) || defined(__ANDROID__)
-#		include "VkSemaphoreExternalLinux.hpp"
-#	else
-#		error "Missing VK_KHR_external_semaphore_fd implementation for this platform!"
-#	endif
-#elif VK_USE_PLATFORM_FUCHSIA
-#	include "VkSemaphoreExternalFuchsia.hpp"
-#else
-#	include "VkSemaphoreExternalNone.hpp"
-#endif
-
 #include "marl/blockingcall.h"
 #include "marl/conditionvariable.h"
 
@@ -38,11 +26,76 @@
 
 namespace vk {
 
+// This is a base abstract class for all external semaphore implementations
+// used in this source file.
+class Semaphore::External
+{
+public:
+	virtual ~External() = default;
+
+	// Initialize new instance with a given initial state.
+	virtual VkResult init(bool initialState) = 0;
+
+	virtual bool tryWait() = 0;
+	virtual void wait() = 0;
+	virtual void signal() = 0;
+
+	// For VK_KHR_external_semaphore_fd
+	virtual VkResult importOpaqueFd(int fd)
+	{
+		return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+	}
+
+	virtual VkResult exportOpaqueFd(int *pFd)
+	{
+		return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+	}
+
+#if VK_USE_PLATFORM_FUCHSIA
+	// For VK_FUCHSIA_external_semaphore
+	virtual VkResult importHandle(zx_handle_t handle)
+	{
+		return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+	}
+	virtual VkResult exportHandle(zx_handle_t *pHandle)
+	{
+		return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+	}
+#endif
+	// Pointer to the previous temporary external instance, used for |tempExternal| only.
+	External *previous = nullptr;
+};
+
+}  // namespace vk
+
+#if SWIFTSHADER_EXTERNAL_SEMAPHORE_OPAQUE_FD
+#	if defined(__linux__) || defined(__ANDROID__)
+#		include "VkSemaphoreExternalLinux.hpp"
+#	else
+#		error "Missing VK_KHR_external_semaphore_fd implementation for this platform!"
+#	endif
+#elif VK_USE_PLATFORM_FUCHSIA
+#	include "VkSemaphoreExternalFuchsia.hpp"
+#endif
+
+namespace vk {
+
+// The bitmask of all external semaphore handle types supported by this source file.
+static const VkExternalSemaphoreHandleTypeFlags kSupportedTypes =
+#if SWIFTSHADER_EXTERNAL_SEMAPHORE_OPAQUE_FD
+    VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
+#endif
+#if VK_USE_PLATFORM_FUCHSIA
+    VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TEMP_ZIRCON_EVENT_BIT_FUCHSIA |
+#endif
+    0;
+
 namespace {
 
 struct SemaphoreCreateInfo
 {
 	bool exportSemaphore = false;
+	VkExternalSemaphoreHandleTypeFlags exportHandleTypes = 0;
 
 	// Create a new instance. The external instance will be allocated only
 	// the pCreateInfo->pNext chain indicates it needs to be exported.
@@ -57,9 +110,12 @@
 				{
 					const auto *exportInfo = reinterpret_cast<const VkExportSemaphoreCreateInfo *>(nextInfo);
 					exportSemaphore = true;
-					if(exportInfo->handleTypes != Semaphore::External::kExternalSemaphoreHandleType)
+					exportHandleTypes = exportInfo->handleTypes;
+					if((exportHandleTypes & ~kSupportedTypes) != 0)
 					{
-						UNSUPPORTED("exportInfo->handleTypes %d", int(exportInfo->handleTypes));
+						UNSUPPORTED("exportInfo->handleTypes 0x%X (supports 0x%X)",
+						            int(exportHandleTypes),
+						            int(kSupportedTypes));
 					}
 				}
 				break;
@@ -74,27 +130,30 @@
 
 void Semaphore::wait()
 {
-	if(external)
+	std::unique_lock<std::mutex> lock(mutex);
+	External *ext = tempExternal ? tempExternal : external;
+	if(ext)
 	{
-		if(!external->tryWait())
+		if(!ext->tryWait())
 		{
-			// Dispatch the external wait to a background thread.
+			// Dispatch the wait on |ext| to a background thread.
 			// Even if this creates a new thread on each
 			// call, it is assumed that this is negligible
 			// compared with the actual semaphore wait()
 			// operation.
-			marl::blocking_call([this]() {
-				external->wait();
+			marl::blocking_call([ext, &lock]() {
+				lock.unlock();
+				ext->wait();
+				lock.lock();
 			});
 		}
 
-		// If the import was temporary, reset the semaphore to its
-		// permanent state by getting rid of |external|.
+		// If the import was temporary, reset the semaphore to its previous state.
 		// See "6.4.5. Importing Semaphore Payloads" in Vulkan 1.1 spec.
-		if(temporaryImport)
+		if(ext == tempExternal)
 		{
-			deallocateExternal();
-			temporaryImport = false;
+			tempExternal = ext->previous;
+			deallocateExternal(ext);
 		}
 	}
 	else
@@ -105,11 +164,13 @@
 
 void Semaphore::signal()
 {
-	if(external)
+	std::unique_lock<std::mutex> lock(mutex);
+	External *ext = tempExternal ? tempExternal : external;
+	if(ext)
 	{
 		// Assumes that signalling an external semaphore is non-blocking,
 		// so it can be performed directly either from a fiber or thread.
-		external->signal();
+		ext->signal();
 	}
 	else
 	{
@@ -121,16 +182,22 @@
     : allocator(pAllocator)
 {
 	SemaphoreCreateInfo info(pCreateInfo);
-	if(info.exportSemaphore)
-	{
-		allocateExternal();
-		external->init();
-	}
+	exportableHandleTypes = info.exportHandleTypes;
 }
 
 void Semaphore::destroy(const VkAllocationCallbacks *pAllocator)
 {
-	deallocateExternal();
+	while(tempExternal)
+	{
+		External *ext = tempExternal;
+		tempExternal = ext->previous;
+		deallocateExternal(ext);
+	}
+	if(external)
+	{
+		deallocateExternal(external);
+		external = nullptr;
+	}
 }
 
 size_t Semaphore::ComputeRequiredAllocationSize(const VkSemaphoreCreateInfo *pCreateInfo)
@@ -139,78 +206,140 @@
 	return 0;
 }
 
-void Semaphore::allocateExternal()
+template<class EXTERNAL>
+Semaphore::External *Semaphore::allocateExternal()
 {
-	ASSERT(external == nullptr);
-	external = reinterpret_cast<Semaphore::External *>(
-	    vk::allocate(sizeof(Semaphore::External), vk::REQUIRED_MEMORY_ALIGNMENT, allocator));
-	new(external) Semaphore::External();
+	auto *ext = reinterpret_cast<Semaphore::External *>(
+	    vk::allocate(sizeof(EXTERNAL), alignof(EXTERNAL), allocator));
+	new(ext) EXTERNAL();
+	return ext;
 }
 
-void Semaphore::deallocateExternal()
+void Semaphore::deallocateExternal(Semaphore::External *ext)
 {
-	if(external)
+	ext->~External();
+	vk::deallocate(ext, allocator);
+}
+
+template<typename ALLOC_FUNC, typename IMPORT_FUNC>
+VkResult Semaphore::importPayload(bool temporaryImport,
+                                  ALLOC_FUNC alloc_func,
+                                  IMPORT_FUNC import_func)
+{
+	std::unique_lock<std::mutex> lock(mutex);
+
+	// Create new External instance if needed.
+	External *ext = external;
+	if(temporaryImport || !ext)
 	{
-		vk::deallocate(external, allocator);
-		external = nullptr;
+		ext = alloc_func();
 	}
+	VkResult result = import_func(ext);
+	if(result != VK_SUCCESS)
+	{
+		if(temporaryImport || !external)
+		{
+			deallocateExternal(ext);
+		}
+		return result;
+	}
+
+	if(temporaryImport)
+	{
+		ext->previous = tempExternal;
+		tempExternal = ext;
+	}
+	else if(!external)
+	{
+		external = ext;
+	}
+	return VK_SUCCESS;
+}
+
+template<typename ALLOC_FUNC, typename EXPORT_FUNC>
+VkResult Semaphore::exportPayload(ALLOC_FUNC alloc_func, EXPORT_FUNC export_func)
+{
+	std::unique_lock<std::mutex> lock(mutex);
+	// Sanity check, do not try to export a semaphore that has a temporary import.
+	if(tempExternal != nullptr)
+	{
+		TRACE("Cannot export semaphore with a temporary import!");
+		return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+	}
+	// Allocate |external| if it doesn't exist yet.
+	if(!external)
+	{
+		External *ext = alloc_func();
+		VkResult result = ext->init(internal.isSignalled());
+		if(result != VK_SUCCESS)
+		{
+			deallocateExternal(ext);
+			return result;
+		}
+		external = ext;
+	}
+	return export_func(external);
 }
 
 #if SWIFTSHADER_EXTERNAL_SEMAPHORE_OPAQUE_FD
-VkResult Semaphore::importFd(int fd, bool tempImport)
+VkResult Semaphore::importFd(int fd, bool temporaryImport)
 {
-	std::unique_lock<std::mutex> lock(mutex);
-	if(!external)
-	{
-		allocateExternal();
-	}
-	VkResult result = external->importFd(fd);
-	if(result != VK_SUCCESS)
-	{
-		deallocateExternal();
-	}
-	else
-	{
-		temporaryImport = tempImport;
-	}
-	return result;
+	return importPayload(
+	    temporaryImport,
+	    [this]() {
+		    return allocateExternal<OpaqueFdExternalSemaphore>();
+	    },
+	    [fd](External *ext) {
+		    return ext->importOpaqueFd(fd);
+	    });
 }
 
 VkResult Semaphore::exportFd(int *pFd)
 {
-	std::unique_lock<std::mutex> lock(mutex);
-	if(!external)
+	if((exportableHandleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) == 0)
 	{
-		TRACE("Cannot export non-external semaphore");
+		TRACE("Cannot export semaphore as opaque FD (exportableHandleType = 0x%X, want 0x%X)",
+		      exportableHandleTypes,
+		      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT);
+
 		return VK_ERROR_INVALID_EXTERNAL_HANDLE;
 	}
-	return external->exportFd(pFd);
+
+	return exportPayload([this]() { return allocateExternal<OpaqueFdExternalSemaphore>(); },
+	                     [pFd](External *ext) {
+		                     return ext->exportOpaqueFd(pFd);
+	                     });
 }
 #endif  // SWIFTSHADER_EXTERNAL_SEMAPHORE_OPAQUE_FD
 
 #if VK_USE_PLATFORM_FUCHSIA
-VkResult Semaphore::importHandle(zx_handle_t handle, bool tempImport)
+VkResult Semaphore::importHandle(zx_handle_t handle, bool temporaryImport)
 {
-	std::unique_lock<std::mutex> lock(mutex);
-	if(!external)
-	{
-		allocateExternal();
-	}
-	// NOTE: Imports are just moving a handle so cannot fail.
-	external->importHandle(handle);
-	temporaryImport = tempImport;
-	return VK_SUCCESS;
+	return importPayload(
+	    temporaryImport,
+	    [this]() {
+		    return allocateExternal<ZirconEventExternalSemaphore>();
+	    },
+	    [handle](External *ext) {
+		    return ext->importHandle(handle);
+	    });
 }
 
 VkResult Semaphore::exportHandle(zx_handle_t *pHandle)
 {
-	std::unique_lock<std::mutex> lock(mutex);
-	if(!external)
+	if((exportableHandleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TEMP_ZIRCON_EVENT_BIT_FUCHSIA) == 0)
 	{
-		TRACE("Cannot export non-external semaphore");
+		TRACE("Cannot export semaphore as Zircon handle (exportableHandleType = 0x%X, want 0x%X)",
+		      exportableHandleTypes,
+		      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TEMP_ZIRCON_EVENT_BIT_FUCHSIA);
+
 		return VK_ERROR_INVALID_EXTERNAL_HANDLE;
 	}
-	return external->exportHandle(pHandle);
+
+	return exportPayload([this]() { return allocateExternal<ZirconEventExternalSemaphore>(); },
+	                     [pHandle](External *ext) {
+		                     return ext->exportHandle(pHandle);
+	                     });
 }
 #endif  // VK_USE_PLATFORM_FUCHSIA
 
diff --git a/src/Vulkan/VkSemaphore.hpp b/src/Vulkan/VkSemaphore.hpp
index 54004d8..b67d4d9 100644
--- a/src/Vulkan/VkSemaphore.hpp
+++ b/src/Vulkan/VkSemaphore.hpp
@@ -58,14 +58,99 @@
 	class External;
 
 private:
-	void allocateExternal();
-	void deallocateExternal();
+	// Small technical note on how semaphores are imported/exported with Vulkan:
+	//
+	// - A Vulkan Semaphore objects has a "payload", corresponding to a
+	//   simple atomic boolean flag.
+	//
+	// - A Vulkan Semaphore object can be "exported": this creates a
+	//   platform-specific handle / descriptor (which can be passed to other
+	//   processes), and is linked in some way to the original semaphore's
+	//   payload.
+	//
+	// - Similarly, said handle / descriptor can be "imported" into a Vulkan
+	//   Semaphore object. By default, that semaphore loses its payload, and
+	//   instead uses the one referenced / shared through the descriptor.
+	//
+	//   Hence if semaphore A exports its payload through a descriptor that
+	//   is later imported into semaphore B, then both A and B will use/share
+	//   the same payload (i.e. signal flag), making cross-process
+	//   synchronization possible.
+	//
+	// - There are also "temporary imports", where the target semaphore's
+	//   payload is not lost, but is simply hidden/stashed. But the next wait()
+	//   operation on the same semaphore should remove the temporary import,
+	//   and restore the previous payload.
+	//
+	// - There are many handle / descriptor types, which are listed through
+	//   the VkExternalSemaphoreHandleTypeFlagBits. A given Vulkan
+	//   implementation might support only one or several at the same time
+	//   (e.g. on Linux or Android, it could support both OPAQUE_FD_BIT and
+	//   SYNC_FD_BIT, while on Windows, it would be OPAQUE_WIN32_BIT +
+	//   OPAQUE_WIN32_KMT_BIT + D3D12_FENCE_BIT).
+	//
+	// - To be able to export a semaphore, VkCreateSemaphore() must be called
+	//   with a VkSemaphoreCreateInfo that lists the types of all possible
+	//   platform-specific handles the semaphore could be exported to
+	//   (e.g. on Linux, it is possible to specify that a semaphore might be
+	//   exported as an opaque FD, or as a Linux Sync FD).
+	//
+	//   However, the exact type is only determined later by the
+	//   export operation itself (e.g. vkGetSemaphoreFdKHR() could be called to export
+	//   either a VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT or a
+	//   VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT).
+	//
+	//   Once a semaphore has been exported as one type, it is not possible
+	//   to export the same payload with a different type (though the spec
+	//   doesn't seem to be explicit about this, it's simply impossible in
+	//   general).
+	//
+	// This leads to the following design:
+	//
+	//   - |internal| is a simple marl::Event that represents the semaphore's
+	//     payload when it is not exported, or imported non-temporarily.
+	//
+	//   - |external| points to an external semaphore payload. It is created
+	//     on demand if the semaphore is exported or imported non-temporarily.
+	//     Note that once |external| is created, |internal| is ignored.
+	//
+	//   - |tempExternal| points to a linked-list of temporary external
+	//     semaphore payloads. The list head corresponds to the most recent
+	//     temporary import.
+	//
+
+	// Internal template to allocate a new External implementation.
+	template<class EXTERNAL>
+	External *allocateExternal();
+
+	void deallocateExternal(External *ext);
+
+	// Used internally to import an external payload.
+	// |temporaryImport| is true iff the import is temporary.
+	// |alloc_func| is a callable that allocates a new External instance of the
+	// appropriate type.
+	// |import_func| is a callable that takes a single parameter, which
+	// corresponds to the external handle/descriptor, and returns a VkResult
+	// value.
+	template<typename ALLOC_FUNC, typename IMPORT_FUNC>
+	VkResult importPayload(bool temporaryImport,
+	                       ALLOC_FUNC alloc_func,
+	                       IMPORT_FUNC import_func);
+
+	// Used internally to export a given payload.
+	// |alloc_func| is a callable that allocates a new External instance of
+	// the appropriate type.
+	// |export_func| is a callable that takes a pointer to an External instance,
+	// and a pointer to a handle/descriptor, and returns a VkResult.
+	template<typename ALLOC_FUNC, typename EXPORT_FUNC>
+	VkResult exportPayload(ALLOC_FUNC alloc_func, EXPORT_FUNC export_func);
 
 	const VkAllocationCallbacks *allocator = nullptr;
-	marl::Event internal;
+	VkExternalSemaphoreHandleTypeFlags exportableHandleTypes = (VkExternalSemaphoreHandleTypeFlags)0;
 	std::mutex mutex;
+	marl::Event internal;
 	External *external = nullptr;
-	bool temporaryImport = false;
+	External *tempExternal = nullptr;
 };
 
 static inline Semaphore *Cast(VkSemaphore object)
diff --git a/src/Vulkan/VkSemaphoreExternalFuchsia.hpp b/src/Vulkan/VkSemaphoreExternalFuchsia.hpp
index a45f0b8..90f68d4 100644
--- a/src/Vulkan/VkSemaphoreExternalFuchsia.hpp
+++ b/src/Vulkan/VkSemaphoreExternalFuchsia.hpp
@@ -26,38 +26,44 @@
 
 namespace vk {
 
-class Semaphore::External
+class ZirconEventExternalSemaphore : public Semaphore::External
 {
 public:
-	// The type of external semaphore handle types supported by this implementation.
-	static const VkExternalSemaphoreHandleTypeFlags kExternalSemaphoreHandleType =
-	    VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TEMP_ZIRCON_EVENT_BIT_FUCHSIA;
-
-	// Default constructor. Note that one should call either init() or
-	// importFd() before any call to wait() or signal().
-	External() = default;
-
-	~External()
+	~ZirconEventExternalSemaphore()
 	{
 		zx_handle_close(handle);
 	}
 
-	void init()
+	VkResult init(bool initialValue) override
 	{
 		zx_status_t status = zx_event_create(0, &handle);
 		if(status != ZX_OK)
 		{
-			ABORT("zx_event_create() returned %d", status);
+			TRACE("zx_event_create() returned %d", status);
+			return VK_ERROR_INITIALIZATION_FAILED;
 		}
+		if(initialValue)
+		{
+			status = zx_object_signal(handle, 0, ZX_EVENT_SIGNALED);
+			if(status != ZX_OK)
+			{
+				TRACE("zx_object_signal() returned %d", status);
+				zx_handle_close(handle);
+				handle = ZX_HANDLE_INVALID;
+				return VK_ERROR_INITIALIZATION_FAILED;
+			}
+		}
+		return VK_SUCCESS;
 	}
 
-	void importHandle(zx_handle_t new_handle)
+	VkResult importHandle(zx_handle_t new_handle) override
 	{
 		zx_handle_close(handle);
 		handle = new_handle;
+		return VK_SUCCESS;
 	}
 
-	VkResult exportHandle(zx_handle_t *pHandle) const
+	VkResult exportHandle(zx_handle_t *pHandle) override
 	{
 		zx_handle_t new_handle = ZX_HANDLE_INVALID;
 		zx_status_t status = zx_handle_duplicate(handle, ZX_RIGHT_SAME_RIGHTS, &new_handle);
@@ -70,7 +76,7 @@
 		return VK_SUCCESS;
 	}
 
-	void wait()
+	void wait() override
 	{
 		zx_signals_t observed = 0;
 		zx_status_t status = zx_object_wait_one(
@@ -91,7 +97,7 @@
 		}
 	}
 
-	bool tryWait()
+	bool tryWait() override
 	{
 		zx_signals_t observed = 0;
 		zx_status_t status = zx_object_wait_one(
@@ -113,7 +119,7 @@
 		return true;
 	}
 
-	void signal()
+	void signal() override
 	{
 		zx_status_t status = zx_object_signal(handle, 0, ZX_EVENT_SIGNALED);
 		if(status != ZX_OK)
diff --git a/src/Vulkan/VkSemaphoreExternalLinux.hpp b/src/Vulkan/VkSemaphoreExternalLinux.hpp
index 36bac26..d4e4cd2 100644
--- a/src/Vulkan/VkSemaphoreExternalLinux.hpp
+++ b/src/Vulkan/VkSemaphoreExternalLinux.hpp
@@ -44,7 +44,8 @@
 class SharedSemaphore
 {
 public:
-	SharedSemaphore()
+	SharedSemaphore(bool initialValue)
+	    : signaled(initialValue)
 	{
 		pthread_mutexattr_t mattr;
 		pthread_mutexattr_init(&mattr);
@@ -129,20 +130,13 @@
 
 namespace vk {
 
-class Semaphore::External
+class OpaqueFdExternalSemaphore : public Semaphore::External
 {
 public:
-	// The type of external semaphore handle types supported by this implementation.
-	static const VkExternalSemaphoreHandleTypeFlags kExternalSemaphoreHandleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
-
-	// Default constructor. Note that one should call either init() or
-	// importFd() before any call to wait() or signal().
-	External() = default;
-
-	~External() { close(); }
+	~OpaqueFdExternalSemaphore() { unmapRegion(); }
 
 	// Initialize instance by creating a new shared memory region.
-	void init()
+	VkResult init(bool initialState) override
 	{
 		// Allocate or import the region's file descriptor.
 		const size_t size = sw::memoryPageSize();
@@ -153,24 +147,30 @@
 		snprintf(name, sizeof(name), "SwiftShader.Semaphore.%d", ++counter);
 		if(!memfd.allocate(name, size))
 		{
-			ABORT("memfd.allocate() returned %s", strerror(errno));
+			TRACE("memfd.allocate() returned %s", strerror(errno));
+			return VK_ERROR_INITIALIZATION_FAILED;
 		}
-		mapRegion(size, true);
+		if(!mapRegion(size, true, initialState))
+			return VK_ERROR_INITIALIZATION_FAILED;
+
+		return VK_SUCCESS;
 	}
 
 	// Import an existing semaphore through its file descriptor.
-	VkResult importFd(int fd)
+	VkResult importOpaqueFd(int fd) override
 	{
-		close();
+		unmapRegion();
 		memfd.importFd(fd);
-		mapRegion(sw::memoryPageSize(), false);
+		if(!mapRegion(sw::memoryPageSize(), false, false))
+			return VK_ERROR_INITIALIZATION_FAILED;
+
 		return VK_SUCCESS;
 	}
 
 	// Export the current semaphore as a duplicated file descriptor to the same
 	// region. This can be consumed by importFd() running in a different
 	// process.
-	VkResult exportFd(int *pFd) const
+	VkResult exportOpaqueFd(int *pFd) override
 	{
 		int fd = memfd.exportFd();
 		if(fd < 0)
@@ -181,24 +181,23 @@
 		return VK_SUCCESS;
 	}
 
-	void wait()
+	void wait() override
 	{
 		semaphore->wait();
 	}
 
-	bool tryWait()
+	bool tryWait() override
 	{
 		return semaphore->tryWait();
 	}
 
-	void signal()
+	void signal() override
 	{
 		semaphore->signal();
 	}
 
 private:
-	// Unmap the semaphore if needed and close its file descriptor.
-	void close()
+	void unmapRegion()
 	{
 		if(semaphore)
 		{
@@ -213,23 +212,25 @@
 	}
 
 	// Remap the shared region and setup the semaphore or increment its reference count.
-	void mapRegion(size_t size, bool needInitialization)
+	bool mapRegion(size_t size, bool needsInitialization, bool initialValue)
 	{
 		// Map the region into memory and point the semaphore to it.
 		void *addr = memfd.mapReadWrite(0, size);
 		if(!addr)
 		{
-			ABORT("mmap() failed: %s", strerror(errno));
+			TRACE("mmap() failed: %s", strerror(errno));
+			return false;
 		}
 		semaphore = reinterpret_cast<SharedSemaphore *>(addr);
-		if(needInitialization)
+		if(needsInitialization)
 		{
-			new(semaphore) SharedSemaphore();
+			new(semaphore) SharedSemaphore(initialValue);
 		}
 		else
 		{
 			semaphore->addRef();
 		}
+		return true;
 	}
 
 	LinuxMemFd memfd;
diff --git a/src/Vulkan/VkSemaphoreExternalNone.hpp b/src/Vulkan/VkSemaphoreExternalNone.hpp
deleted file mode 100644
index 9592e3f..0000000
--- a/src/Vulkan/VkSemaphoreExternalNone.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef VK_SEMAPHORE_EXTERNAL_NONE_H_
-#define VK_SEMAPHORE_EXTERNAL_NONE_H_
-
-namespace vk {
-
-// Empty external sempahore implementation.
-class Semaphore::External
-{
-public:
-	// The type of external semaphore handle types supported by this implementation.
-	static const VkExternalSemaphoreHandleTypeFlags kExternalSemaphoreHandleType = 0;
-
-	void init() {}
-
-	void wait() {}
-
-	bool tryWait() { return true; }
-
-	void signal() {}
-
-private:
-	int dummy;
-};
-
-}  // namespace vk
-
-#endif  // VK_SEMAPHORE_EXTERNAL_NONE_H_
diff --git a/third_party/llvm-subzero/include/llvm/Support/Compiler.h b/third_party/llvm-subzero/include/llvm/Support/Compiler.h
index 55148a4..803fd48 100644
--- a/third_party/llvm-subzero/include/llvm/Support/Compiler.h
+++ b/third_party/llvm-subzero/include/llvm/Support/Compiler.h
@@ -243,6 +243,15 @@
 #define LLVM_FALLTHROUGH
 #endif
 
+/// LLVM_REQUIRE_CONSTANT_INITIALIZATION - Apply this to globals to ensure that
+/// they are constant initialized.
+#if __has_cpp_attribute(clang::require_constant_initialization)
+#define LLVM_REQUIRE_CONSTANT_INITIALIZATION                                   \
+  [[clang::require_constant_initialization]]
+#else
+#define LLVM_REQUIRE_CONSTANT_INITIALIZATION
+#endif
+
 /// LLVM_EXTENSION - Support compilers where we have a keyword to suppress
 /// pedantic diagnostics.
 #ifdef __GNUC__
diff --git a/third_party/llvm-subzero/include/llvm/Support/ManagedStatic.h b/third_party/llvm-subzero/include/llvm/Support/ManagedStatic.h
index 7ce86ee..e4ebd7c 100644
--- a/third_party/llvm-subzero/include/llvm/Support/ManagedStatic.h
+++ b/third_party/llvm-subzero/include/llvm/Support/ManagedStatic.h
@@ -36,18 +36,37 @@
   static void call(void *Ptr) { delete[](T *)Ptr; }
 };
 
+// If the current compiler is MSVC 2017 or earlier, then we have to work around
+// a bug where MSVC emits code to perform dynamic initialization even if the
+// class has a constexpr constructor. Instead, fall back to the C++98 strategy
+// where there are no constructors or member initializers. We can remove this
+// when MSVC 2019 (19.20+) is our minimum supported version.
+#if !defined(__clang__) && defined(_MSC_VER) && _MSC_VER < 1920
+#define LLVM_AVOID_CONSTEXPR_CTOR
+#endif
+
 /// ManagedStaticBase - Common base class for ManagedStatic instances.
 class ManagedStaticBase {
 protected:
+#ifndef LLVM_AVOID_CONSTEXPR_CTOR
+  mutable std::atomic<void *> Ptr{};
+  mutable void (*DeleterFn)(void *) = nullptr;
+  mutable const ManagedStaticBase *Next = nullptr;
+#else
   // This should only be used as a static variable, which guarantees that this
   // will be zero initialized.
   mutable std::atomic<void *> Ptr;
-  mutable void (*DeleterFn)(void*);
+  mutable void (*DeleterFn)(void *);
   mutable const ManagedStaticBase *Next;
+#endif
 
   void RegisterManagedStatic(void *(*creator)(), void (*deleter)(void*)) const;
 
 public:
+#ifndef LLVM_AVOID_CONSTEXPR_CTOR
+  constexpr ManagedStaticBase() = default;
+#endif
+
   /// isConstructed - Return true if this object has not been created yet.
   bool isConstructed() const { return Ptr != nullptr; }
 
diff --git a/third_party/llvm-subzero/lib/Support/CommandLine.cpp b/third_party/llvm-subzero/lib/Support/CommandLine.cpp
index fa1782c..3e77cd0 100644
--- a/third_party/llvm-subzero/lib/Support/CommandLine.cpp
+++ b/third_party/llvm-subzero/lib/Support/CommandLine.cpp
@@ -383,11 +383,16 @@
   GlobalParser->registerCategory(this);
 }
 
-// A special subcommand representing no subcommand
-ManagedStatic<SubCommand> llvm::cl::TopLevelSubCommand;
+// A special subcommand representing no subcommand. It is particularly important
+// that this ManagedStatic uses constant initialization and not dynamic
+// initialization because it is referenced from cl::opt constructors, which run
+// dynamically in an arbitrary order.
+LLVM_REQUIRE_CONSTANT_INITIALIZATION ManagedStatic<SubCommand>
+llvm::cl::TopLevelSubCommand;
 
 // A special subcommand that can be used to put an option into all subcommands.
-ManagedStatic<SubCommand> llvm::cl::AllSubCommands;
+LLVM_REQUIRE_CONSTANT_INITIALIZATION ManagedStatic<SubCommand>
+llvm::cl::AllSubCommands;
 
 void SubCommand::registerSubCommand() {
   GlobalParser->registerSubCommand(this);
diff --git a/third_party/marl/src/osfiber_x86.c b/third_party/marl/src/osfiber_x86.c
index 6c486aa..cac72cb 100644
--- a/third_party/marl/src/osfiber_x86.c
+++ b/third_party/marl/src/osfiber_x86.c
@@ -25,12 +25,19 @@
                            uint32_t stack_size,
                            void (*target)(void*),
                            void* arg) {
+  // The stack pointer needs to be 16-byte aligned when making a 'call'.
+  // The 'call' instruction automatically pushes the return instruction to the
+  // stack (4-bytes), before making the jump.
+  // The marl_fiber_swap() assembly function does not use 'call', instead it
+  // uses 'jmp', so we need to offset the ESP pointer by 4 bytes so that the
+  // stack is still 16-byte aligned when the return target is stack-popped by
+  // the callee.
   uintptr_t* stack_top = (uintptr_t*)((uint8_t*)(stack) + stack_size);
   ctx->EIP = (uintptr_t)&marl_fiber_trampoline;
-  ctx->ESP = (uintptr_t)&stack_top[-3];
-  stack_top[-1] = (uintptr_t)arg;
-  stack_top[-2] = (uintptr_t)target;
-  stack_top[-3] = 0;  // No return target.
+  ctx->ESP = (uintptr_t)&stack_top[-5];
+  stack_top[-3] = (uintptr_t)arg;
+  stack_top[-4] = (uintptr_t)target;
+  stack_top[-5] = 0;  // No return target.
 }
 
 #endif  // defined(__i386__)