Subzero: fix CoroutineBegin generation

Rework how instructions are injected at the top of the CoroutineBegin
function by getting rid of replaceEntryNode, which attempted to replace
the entry node with a non-entry one. This seemed to work on all targets,
except for Windows x86-32 (Win32) when passing enough arguments to
Coroutines. In this case, it would crash in the code generated right
after this injected code. It looks like the code in replaceEntryNode is
not quite right, resulting in Subzero creating needless stack allocs per
argument, and ultimately generating invalid offsets from the stack
pointer.

Instead of fixing replaceEntryNode, I now simply remember the entryNode
for CoroutineBegin to use, adding the rest to a separate node for
basicBlock, and when finalizing the function, I connect entryNode to the
initial basicBlock node via a branch. This way, there is not messing
around with function's node list.

This not only fixes the crash, but gets rid of the needless stack
allocs per arg.

Bug: angleproject:4482
Change-Id: I13f9c8c43ee07f35302208d9876e6fbdf0b1ad26
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/42608
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 4607f9c..df72f66 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -58,37 +58,6 @@
 // These functions only accept and return Subzero (Ice) types, and do not access any globals.
 namespace {
 namespace sz {
-void replaceEntryNode(Ice::Cfg *function, Ice::CfgNode *newEntryNode)
-{
-	ASSERT_MSG(function->getEntryNode() != nullptr, "Function should have an entry node");
-
-	if(function->getEntryNode() == newEntryNode)
-	{
-		return;
-	}
-
-	// Make this the new entry node
-	function->setEntryNode(newEntryNode);
-
-	// Reorder nodes so that new entry block comes first. This is required
-	// by Cfg::renumberInstructions, which expects the first node in the list
-	// to be the entry node.
-	{
-		auto nodes = function->getNodes();
-
-		// TODO(amaiorano): Fast path if newEntryNode is last? Can avoid linear search.
-
-		auto iter = std::find(nodes.begin(), nodes.end(), newEntryNode);
-		ASSERT_MSG(iter != nodes.end(), "New node should be in the function's node list");
-
-		nodes.erase(iter);
-		nodes.insert(nodes.begin(), newEntryNode);
-
-		// swapNodes replaces its nodes with the input one, and renumbers them,
-		// so our new entry node will be 0, and the previous will be 1.
-		function->swapNodes(nodes);
-	}
-}
 
 Ice::Cfg *createFunction(Ice::GlobalContext *context, Ice::Type returnType, const std::vector<Ice::Type> &paramTypes)
 {
@@ -258,6 +227,8 @@
 
 Ice::GlobalContext *context = nullptr;
 Ice::Cfg *function = nullptr;
+Ice::CfgNode *entryBlock = nullptr;
+Ice::CfgNode *basicBlockTop = nullptr;
 Ice::CfgNode *basicBlock = nullptr;
 Ice::CfgLocalAllocatorScope *allocator = nullptr;
 rr::ELFMemoryStreamer *routine = nullptr;
@@ -491,12 +462,17 @@
 	return Ice::typeWidthInBytes(T(type));
 }
 
-static void createRetVoidIfNoRet()
+static void finalizeFunction()
 {
+	// Create a return if none was added
 	if(::basicBlock->getInsts().empty() || ::basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
 	{
 		Nucleus::createRetVoid();
 	}
+
+	// Connect the entry block to the top of the initial basic block
+	auto br = Ice::InstBr::create(::function, ::basicBlockTop);
+	::entryBlock->appendInst(br);
 }
 
 using ElfHeader = std::conditional<sizeof(void *) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
@@ -947,7 +923,9 @@
 	delete ::out;
 	::out = nullptr;
 
+	::entryBlock = nullptr;
 	::basicBlock = nullptr;
+	::basicBlockTop = nullptr;
 
 	::codegenMutex.unlock();
 }
@@ -1064,7 +1042,7 @@
 
 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
 {
-	createRetVoidIfNoRet();
+	finalizeFunction();
 	return rr::acquireRoutine({ ::function }, { name }, cfgEdit);
 }
 
@@ -1105,7 +1083,9 @@
 {
 	ASSERT(::function == nullptr);
 	ASSERT(::allocator == nullptr);
+	ASSERT(::entryBlock == nullptr);
 	ASSERT(::basicBlock == nullptr);
+	ASSERT(::basicBlockTop == nullptr);
 
 	::function = sz::createFunction(::context, T(returnType), T(paramTypes));
 
@@ -1115,7 +1095,9 @@
 	// TODO: Get rid of this as a global, and create scoped allocs in every Nucleus function instead.
 	::allocator = new Ice::CfgLocalAllocatorScope(::function);
 
-	::basicBlock = ::function->getEntryNode();
+	::entryBlock = ::function->getEntryNode();
+	::basicBlock = ::function->makeNode();
+	::basicBlockTop = ::basicBlock;
 }
 
 Value *Nucleus::getArgument(unsigned int index)
@@ -4617,29 +4599,13 @@
 		//        ... <REACTOR CODE> ...
 		//
 
-		// Save original entry block and current block, and create a new entry block and make it current.
-		// This new block will be used to inject code above the begin routine's existing code. We make
-		// this block branch to the original entry block as the last instruction.
-		auto origEntryBB = ::function->getEntryNode();
-		auto origCurrBB = ::basicBlock;
-		auto newBB = ::function->makeNode();
-		sz::replaceEntryNode(::function, newBB);
-		::basicBlock = newBB;
-
 		//        this->handle = coro::getHandleParam();
-		this->handle = sz::Call(::function, ::basicBlock, coro::getHandleParam);
+		this->handle = sz::Call(::function, ::entryBlock, coro::getHandleParam);
 
 		//        YieldType promise;
 		//        coro::setPromisePtr(handle, &promise); // For await
 		this->promise = sz::allocateStackVariable(::function, T(::coroYieldType));
-		sz::Call(::function, ::basicBlock, coro::setPromisePtr, this->handle, this->promise);
-
-		// Branch to original entry block
-		auto br = Ice::InstBr::create(::function, origEntryBB);
-		::basicBlock->appendInst(br);
-
-		// Restore current block for future instructions
-		::basicBlock = origCurrBB;
+		sz::Call(::function, ::entryBlock, coro::setPromisePtr, this->handle, this->promise);
 	}
 
 	// Adds instructions for Yield() calls at the current location of the main coroutine function.
@@ -4853,7 +4819,7 @@
 		// Finish generating coroutine functions
 		{
 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
-			createRetVoidIfNoRet();
+			finalizeFunction();
 		}
 
 		auto awaitFunc = ::coroGen->generateAwaitFunction();
@@ -4873,7 +4839,7 @@
 	{
 		{
 			Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
-			createRetVoidIfNoRet();
+			finalizeFunction();
 		}
 
 		::coroYieldType = nullptr;