Propagate stores of stack addresses into other local variables

Pointers to the base address of arrays often get stored and then loaded
multiple time. This change adds an optimization pass to replace these
loads with the result of the alloca of the array, and eliminates the
allocas of the pointer variables.

The PropagateAlloca unit test without this optimization produces:

 sub         rsp,38h
 mov         dword ptr [rsp+8],16h
 cmp         ecx,0
 je          000002543C4E407E
 lea         rax,[rsp+8]
 mov         qword ptr [rsp],rax
 mov         rax,qword ptr [rsp]
 mov         eax,dword ptr [rax]
 add         rsp,38h
 ret

With the optimization pass it becomes:

 sub         rsp,38h
 mov         dword ptr [rsp],16h
 cmp         ecx,0
 jne         0000015DC3B33074
 mov         eax,dword ptr [rsp]
 add         rsp,38h
 ret

Also add a couple of unit tests for corner cases where the propagation
is not safe to perform.

Bug: b/179279298
Change-Id: I784899319bf5360f47c6fbc22fb82134d135dbfc
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/52468
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Reactor/Optimizer.cpp b/src/Reactor/Optimizer.cpp
index 4ac06de..a3901d7 100644
--- a/src/Reactor/Optimizer.cpp
+++ b/src/Reactor/Optimizer.cpp
@@ -28,7 +28,9 @@
 
 private:
 	void analyzeUses(Ice::Cfg *function);
+
 	void eliminateDeadCode();
+	void propagateAlloca();
 	void eliminateUnitializedLoads();
 	void eliminateLoadsFollowingSingleStore();
 	void optimizeStoresInSingleBasicBlock();
@@ -95,7 +97,13 @@
 
 	analyzeUses(function);
 
+	// Start by eliminating any dead code, to avoid redundant work for the
+	// subsequent optimization passes.
 	eliminateDeadCode();
+
+	// Eliminate allocas which store the address of other allocas.
+	propagateAlloca();
+
 	eliminateUnitializedLoads();
 	eliminateLoadsFollowingSingleStore();
 	optimizeStoresInSingleBasicBlock();
@@ -109,6 +117,63 @@
 	operandsWithUses.clear();
 }
 
+// Eliminates allocas which store the address of other allocas.
+void Optimizer::propagateAlloca()
+{
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+	Ice::InstList &instList = entryBlock->getInsts();
+
+	for(Ice::Inst &inst : instList)
+	{
+		if(inst.isDeleted())
+		{
+			continue;
+		}
+
+		auto *alloca = llvm::dyn_cast<Ice::InstAlloca>(&inst);
+
+		if(!alloca)
+		{
+			break;  // Allocas are all at the top
+		}
+
+		// Look for stores of this alloca's address.
+		Ice::Operand *address = alloca->getDest();
+		Uses uses = *getUses(address);  // Hard copy
+
+		for(auto *store : uses)
+		{
+			if(isStore(*store) && store->getData() == address)
+			{
+				Ice::Operand *dest = store->getStoreAddress();
+				Ice::Variable *destVar = llvm::dyn_cast<Ice::Variable>(dest);
+				Ice::Inst *def = destVar ? getDefinition(destVar) : nullptr;
+
+				// If the address is stored into another stack variable, eliminate the latter.
+				if(def && def->getKind() == Ice::Inst::Alloca)
+				{
+					Uses destUses = *getUses(dest);  // Hard copy
+
+					// Make sure the only store into the stack variable is this address, and that all of its other uses are loads.
+					// This prevents dynamic array loads/stores to be replaced by a scalar.
+					if((destUses.stores.size() == 1) && (destUses.loads.size() == destUses.size() - 1))
+					{
+						for(auto *load : destUses.loads)
+						{
+							replace(load, address);
+						}
+
+						// The address is now only stored, never loaded, so the store can be eliminated, together with its alloca.
+						assert(getUses(dest)->size() == 1);
+						deleteInstruction(store);
+						assert(def->isDeleted());
+					}
+				}
+			}
+		}
+	}
+}
+
 void Optimizer::eliminateDeadCode()
 {
 	bool modified;
diff --git a/tests/ReactorUnitTests/ReactorUnitTests.cpp b/tests/ReactorUnitTests/ReactorUnitTests.cpp
index b9eeb07..aa91dbe 100644
--- a/tests/ReactorUnitTests/ReactorUnitTests.cpp
+++ b/tests/ReactorUnitTests/ReactorUnitTests.cpp
@@ -376,6 +376,83 @@
 	EXPECT_EQ(result, 7);
 }
 
+// This test excercises the Optimizer::propagateAlloca() optimization pass.
+// The pointer variable should not get stored to / loaded from memory.
+// TODO(b/180665600): Check that the optimization took place.
+TEST(ReactorUnitTests, PropagateAlloca)
+{
+	FunctionT<int(int)> function;
+	{
+		Int b = function.Arg<0>();
+
+		Int a = 22;
+		Pointer<Int> p;
+
+		// This branch materializes both `a` and `p`, and ensures single basic block
+		// optimizations don't also eliminate the pointer store and load.
+		If(b != 0)  // TODO(b/179922668): Support If(b)
+		{
+			p = &a;
+		}
+
+		Return(Int(*p));  // TODO(b/179694472): Support Return(*p)
+	}
+
+	auto routine = function(testName().c_str());
+
+	int result = routine(true);
+	EXPECT_EQ(result, 22);
+}
+
+// Corner case for Optimizer::propagateAlloca(). It should not replace loading of `p`
+// with the addres of `a`, since it also got the address of `b` assigned.
+TEST(ReactorUnitTests, PointerToPointer)
+{
+	FunctionT<int()> function;
+	{
+		Int a = 444;
+		Int b = 555;
+		Int c = 666;
+
+		Pointer<Int> p = &a;
+		Pointer<Pointer<Int>> pp = &p;
+		p = &b;
+
+		Return(Int(*Pointer<Int>(*pp)));  // TODO(b/179694472): Support **pp
+	}
+
+	auto routine = function(testName().c_str());
+
+	int result = routine();
+	EXPECT_EQ(result, 555);
+}
+
+// Corner case for Optimizer::propagateAlloca(). It should not replace loading of `p[i]`
+// with any of the addresses of the `a`, `b`, or `c`.
+TEST(ReactorUnitTests, ArrayOfPointersToLocals)
+{
+	FunctionT<int(int)> function;
+	{
+		Int i = function.Arg<0>();
+
+		Int a = 111;
+		Int b = 222;
+		Int c = 333;
+
+		Array<Pointer<Int>, 3> p;
+		p[0] = &a;
+		p[1] = &b;
+		p[2] = &c;
+
+		Return(Int(*Pointer<Int>(p[i])));  // TODO(b/179694472): Support *p[i]
+	}
+
+	auto routine = function(testName().c_str());
+
+	int result = routine(1);
+	EXPECT_EQ(result, 222);
+}
+
 TEST(ReactorUnitTests, SubVectorLoadStore)
 {
 	FunctionT<int(void *, void *)> function;