Implement propagation of stores to loads in single basic blocks

For loads following stores in the same basic block, replace their result
with the data that was stored.

It transforms the output of the StoresInMultipleBlocks test from:

 sub         rsp,38h
 mov         dword ptr [rsp],0Dh
 cmp         ecx,0
 je          a
 mov         dword ptr [rsp],4
 add         dword ptr [rsp],3
b:
 mov         eax,dword ptr [rsp]
 add         rsp,38h
 ret
a:
 mov         dword ptr [rsp],6
 add         dword ptr [rsp],5
 jmp         b

Into:

 sub         rsp,38h
 mov         dword ptr [rsp],0Dh
 cmp         ecx,0
 je          a
 mov         dword ptr [rsp],4
 mov         eax,4
 add         eax,3
 mov         dword ptr [rsp],eax
b:
 mov         eax,dword ptr [rsp]
 add         rsp,38h
 ret
a:
 mov         dword ptr [rsp],6
 mov         eax,6
 add         eax,5
 mov         dword ptr [rsp],eax
 jmp         b

While at first this might seem like a regression, note that
`add [rsp],3` performs both a load and a store. The optimization pass
eliminated two load operations from this test. The redundant stores will
be eliminated by a subsequent change.

Also adds a unit test for the case where store-to-load propagation
should not be performed due to an indirect store through a pointer
happening in between.

Bug: b/179668593
Change-Id: I6ca133ac4e77bbbc3efd517dff6e8dee6d2dc147
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/52533
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Reactor/Optimizer.cpp b/src/Reactor/Optimizer.cpp
index acc9eae..abb7d4c 100644
--- a/src/Reactor/Optimizer.cpp
+++ b/src/Reactor/Optimizer.cpp
@@ -17,6 +17,7 @@
 #include "src/IceCfg.h"
 #include "src/IceCfgNode.h"
 
+#include <unordered_map>
 #include <vector>
 
 namespace {
@@ -37,6 +38,7 @@
 	void eliminateDeadCode();
 	void propagateAlloca();
 	void performScalarReplacementOfAggregates();
+	void optimizeSingleBasicBlockLoadsStores();
 	void eliminateUnitializedLoads();
 	void eliminateLoadsFollowingSingleStore();
 	void optimizeStoresInSingleBasicBlock();
@@ -45,6 +47,7 @@
 	void deleteInstruction(Ice::Inst *instruction);
 	bool isDead(Ice::Inst *instruction);
 	bool isStaticallyIndexedArray(Ice::Operand *allocaAddress);
+	Ice::InstAlloca *allocaOf(Ice::Operand *address);
 
 	static const Ice::InstIntrinsic *asLoadSubVector(const Ice::Inst *instruction);
 	static const Ice::InstIntrinsic *asStoreSubVector(const Ice::Inst *instruction);
@@ -118,6 +121,9 @@
 	// Replace arrays with individual elements if only statically indexed.
 	performScalarReplacementOfAggregates();
 
+	// Iterate through basic blocks to propagate loads following stores.
+	optimizeSingleBasicBlockLoadsStores();
+
 	eliminateUnitializedLoads();
 	eliminateLoadsFollowingSingleStore();
 	optimizeStoresInSingleBasicBlock();
@@ -621,6 +627,61 @@
 	}
 }
 
+// Iterate through basic blocks to propagate stores to subsequent loads.
+void Optimizer::optimizeSingleBasicBlockLoadsStores()
+{
+	for(Ice::CfgNode *block : function->getNodes())
+	{
+		// For each stack variable keep track of the last store instruction.
+		std::unordered_map<const Ice::InstAlloca *, const Ice::Inst *> lastStoreTo;
+
+		for(Ice::Inst &inst : block->getInsts())
+		{
+			if(inst.isDeleted())
+			{
+				continue;
+			}
+
+			if(isStore(inst))
+			{
+				Ice::Operand *address = inst.getStoreAddress();
+
+				if(Ice::InstAlloca *alloca = allocaOf(address))
+				{
+					// Only consider this store for propagation if its address is not used as
+					// a pointer which could be used for indirect stores.
+					if(getUses(address)->areOnlyLoadStore())
+					{
+						lastStoreTo[alloca] = &inst;
+					}
+				}
+			}
+			else if(isLoad(inst))
+			{
+				if(Ice::InstAlloca *alloca = allocaOf(inst.getLoadAddress()))
+				{
+					auto entry = lastStoreTo.find(alloca);
+					if(entry != lastStoreTo.end())
+					{
+						const Ice::Inst *store = entry->second;
+
+						// Conservatively check that the types match.
+						// TODO(b/179668593): Also valid to propagate if the load is smaller or equal in size to the store.
+						if(loadTypeMatchesStore(&inst, store))
+						{
+							replace(&inst, store->getData());
+						}
+					}
+				}
+			}
+		}
+	}
+
+	// This can leave some dead instructions. Specifically stores.
+	// TODO((b/179668593): Eliminate stores superseded by subsequent stores?
+	eliminateDeadCode();
+}
+
 void Optimizer::analyzeUses(Ice::Cfg *function)
 {
 	for(Ice::CfgNode *basicBlock : function->getNodes())
@@ -802,6 +863,15 @@
 	return true;
 }
 
+Ice::InstAlloca *Optimizer::allocaOf(Ice::Operand *address)
+{
+	Ice::Variable *addressVar = llvm::dyn_cast<Ice::Variable>(address);
+	Ice::Inst *def = addressVar ? getDefinition(addressVar) : nullptr;
+	Ice::InstAlloca *alloca = def ? llvm::dyn_cast<Ice::InstAlloca>(def) : nullptr;
+
+	return alloca;
+}
+
 const Ice::InstIntrinsic *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
 {
 	if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsic>(instruction))
diff --git a/tests/ReactorUnitTests/ReactorUnitTests.cpp b/tests/ReactorUnitTests/ReactorUnitTests.cpp
index fc8665a..ffd78fc 100644
--- a/tests/ReactorUnitTests/ReactorUnitTests.cpp
+++ b/tests/ReactorUnitTests/ReactorUnitTests.cpp
@@ -596,6 +596,76 @@
 	EXPECT_EQ(result, 3);
 }
 
+// Excercises the optimizeSingleBasicBlockLoadsStores optimization pass.
+TEST(ReactorUnitTests, StoresInMultipleBlocks)
+{
+	FunctionT<int(int)> function;
+	{
+		Int b = function.Arg<0>();
+
+		Int a = 13;
+
+		If(b != 0)  // TODO(b/179922668): Support If(b)
+		{
+			a = 4;
+			a = a + 3;
+		}
+		Else
+		{
+			a = 6;
+			a = a + 5;
+		}
+
+		Return(a);
+	}
+
+	Nucleus::setOptimizerCallback([](const Nucleus::OptimizerReport *report) {
+		EXPECT_EQ(report->allocas, 1);
+		EXPECT_EQ(report->loads, 1);
+		EXPECT_EQ(report->stores, 5);
+	});
+
+	auto routine = function(testName().c_str());
+
+	int result = routine(true);
+	EXPECT_EQ(result, 7);
+}
+
+// This is similar to the LoadAfterIndirectStore test except that the indirect
+// store is preceded by a direct store. The subsequent load should not be replaced
+// by the value written by the direct store.
+TEST(ReactorUnitTests, StoreBeforeIndirectStore)
+{
+	FunctionT<int(int)> function;
+	{
+		//Int b = function.Arg<0>();
+
+		Int b;
+		Pointer<Int> p = &b;
+		Int a = 13;
+
+		For(Int i = 0, i < 2, i++)
+		{
+			a = 10;
+
+			*p = 4;
+
+			// This load of `a` should not be replaced by the 10 written above, since
+			// in the second iteration `p` points to `a` and writes 4.
+			b = a;
+
+			p = &a;
+		}
+
+		Return(b);
+	}
+
+	auto routine = function(testName().c_str());
+
+	int result = routine(true);
+	EXPECT_EQ(result, 4);
+}
+
 TEST(ReactorUnitTests, SubVectorLoadStore)
 {
 	FunctionT<int(void *, void *)> function;