Implement scalar replacement of aggregates

This change implements the Scalar Replacement of Aggregates (SRoA)
optimization. Since Reactor only supports array aggregates, this
replaces arrays which are only indexed by static indices with individual
variables for each element.

The ReactorArray test is optimized from:

 sub         rsp,38h
 mov         dword ptr [rsp],1
 mov         dword ptr [rsp+4],2
 mov         eax,dword ptr [rsp]
 mov         ecx,dword ptr [rsp+4]
 mov         dword ptr [rsp],ecx
 mov         dword ptr [rsp+4],eax
 mov         eax,dword ptr [rsp+4]
 add         eax,dword ptr [rsp]
 add         rsp,38h
 ret

Into:

 sub         rsp,20h
 mov         eax,2
 add         eax,1
 add         rsp,20h
 ret

Which is identical to the CArray test's generated code.

Bug: b/179279298
Change-Id: Ie45261f2ac783bdc22fee06adf03ebd588f3ebc3
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/52428
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Reactor/Optimizer.cpp b/src/Reactor/Optimizer.cpp
index 8ec7af1..acc9eae 100644
--- a/src/Reactor/Optimizer.cpp
+++ b/src/Reactor/Optimizer.cpp
@@ -36,6 +36,7 @@
 
 	void eliminateDeadCode();
 	void propagateAlloca();
+	void performScalarReplacementOfAggregates();
 	void eliminateUnitializedLoads();
 	void eliminateLoadsFollowingSingleStore();
 	void optimizeStoresInSingleBasicBlock();
@@ -43,6 +44,7 @@
 	void replace(Ice::Inst *instruction, Ice::Operand *newValue);
 	void deleteInstruction(Ice::Inst *instruction);
 	bool isDead(Ice::Inst *instruction);
+	bool isStaticallyIndexedArray(Ice::Operand *allocaAddress);
 
 	static const Ice::InstIntrinsic *asLoadSubVector(const Ice::Inst *instruction);
 	static const Ice::InstIntrinsic *asStoreSubVector(const Ice::Inst *instruction);
@@ -113,6 +115,9 @@
 	// Eliminate allocas which store the address of other allocas.
 	propagateAlloca();
 
+	// Replace arrays with individual elements if only statically indexed.
+	performScalarReplacementOfAggregates();
+
 	eliminateUnitializedLoads();
 	eliminateLoadsFollowingSingleStore();
 	optimizeStoresInSingleBasicBlock();
@@ -185,6 +190,117 @@
 	}
 }
 
+Ice::Type pointerType()
+{
+	if(sizeof(void *) == 8)
+	{
+		return Ice::IceType_i64;
+	}
+	else
+	{
+		return Ice::IceType_i32;
+	}
+}
+
+// Replace arrays with individual elements if only statically indexed.
+void Optimizer::performScalarReplacementOfAggregates()
+{
+	std::vector<Ice::InstAlloca *> newAllocas;
+
+	Ice::CfgNode *entryBlock = function->getEntryNode();
+	Ice::InstList &instList = entryBlock->getInsts();
+
+	for(Ice::Inst &inst : instList)
+	{
+		if(inst.isDeleted())
+		{
+			continue;
+		}
+
+		auto *alloca = llvm::dyn_cast<Ice::InstAlloca>(&inst);
+
+		if(!alloca)
+		{
+			break;  // Allocas are all at the top
+		}
+
+		uint32_t sizeInBytes = llvm::cast<Ice::ConstantInteger32>(alloca->getSizeInBytes())->getValue();
+		uint32_t alignInBytes = alloca->getAlignInBytes();
+
+		// This pass relies on array elements to be naturally aligned (i.e. matches the type size).
+		assert(sizeInBytes >= alignInBytes);
+		assert(sizeInBytes % alignInBytes == 0);
+		uint32_t elementCount = sizeInBytes / alignInBytes;
+
+		Ice::Operand *address = alloca->getDest();
+
+		if(isStaticallyIndexedArray(address))
+		{
+			// Delete the old array.
+			alloca->setDeleted();
+
+			// Allocate new stack slots for each element.
+			std::vector<Ice::Variable *> newAddress(elementCount);
+			auto *bytes = Ice::ConstantInteger32::create(context, Ice::IceType_i32, alignInBytes);
+
+			for(uint32_t i = 0; i < elementCount; i++)
+			{
+				newAddress[i] = function->makeVariable(pointerType());
+				auto *alloca = Ice::InstAlloca::create(function, newAddress[i], bytes, alignInBytes);
+				setDefinition(newAddress[i], alloca);
+
+				newAllocas.push_back(alloca);
+			}
+
+			Uses uses = *getUses(address);  // Hard copy
+
+			for(auto *use : uses)
+			{
+				assert(!use->isDeleted());
+
+				if(isLoad(*use))  // Direct use of base address
+				{
+					use->replaceSource(asLoadSubVector(use) ? 1 : 0, newAddress[0]);
+					getUses(newAddress[0])->insert(newAddress[0], use);
+				}
+				else if(isStore(*use))  // Direct use of base address
+				{
+					use->replaceSource(asStoreSubVector(use) ? 2 : 1, newAddress[0]);
+					getUses(newAddress[0])->insert(newAddress[0], use);
+				}
+				else  // Statically indexed use
+				{
+					auto *arithmetic = llvm::cast<Ice::InstArithmetic>(use);
+
+					if(arithmetic->getOp() == Ice::InstArithmetic::Add)
+					{
+						auto *rhs = arithmetic->getSrc(1);
+						int32_t offset = llvm::cast<Ice::ConstantInteger32>(rhs)->getValue();
+
+						assert(offset % alignInBytes == 0);
+						int32_t index = offset / alignInBytes;
+						assert(static_cast<uint32_t>(index) < elementCount);
+
+						replace(arithmetic, newAddress[index]);
+					}
+					else
+						assert(false && "Mismatch between isStaticallyIndexedArray() and scalarReplacementOfAggregates()");
+				}
+			}
+		}
+	}
+
+	// After looping over all the old allocas, add any new ones that replace them.
+	// They're added to the front in reverse order, to retain their original order.
+	for(size_t i = newAllocas.size(); i-- != 0;)
+	{
+		if(!isDead(newAllocas[i]))
+		{
+			instList.push_front(newAllocas[i]);
+		}
+	}
+}
+
 void Optimizer::eliminateDeadCode()
 {
 	bool modified;
@@ -582,6 +698,7 @@
 		return;
 	}
 
+	assert(!instruction->getDest() || getUses(instruction->getDest())->empty());
 	instruction->setDeleted();
 
 	for(Ice::SizeT i = 0; i < instruction->getSrcSize(); i++)
@@ -639,6 +756,52 @@
 	return false;
 }
 
+bool Optimizer::isStaticallyIndexedArray(Ice::Operand *allocaAddress)
+{
+	auto &uses = *getUses(allocaAddress);
+
+	for(auto *use : uses)
+	{
+		// Direct load from base address.
+		if(isLoad(*use) && use->getLoadAddress() == allocaAddress)
+		{
+			continue;  // This is fine.
+		}
+
+		if(isStore(*use))
+		{
+			// Can either be the address we're storing to, or the data we're storing.
+			if(use->getStoreAddress() == allocaAddress)
+			{
+				continue;
+			}
+			else
+			{
+				// propagateAlloca() eliminates most of the stores of the address itself.
+				// For the cases it doesn't handle, assume SRoA is not feasible.
+				return false;
+			}
+		}
+
+		// Pointer arithmetic is fine if it only uses constants.
+		auto *arithmetic = llvm::dyn_cast<Ice::InstArithmetic>(use);
+		if(arithmetic && arithmetic->getOp() == Ice::InstArithmetic::Add)
+		{
+			auto *rhs = arithmetic->getSrc(1);
+
+			if(llvm::isa<Ice::Constant>(rhs))
+			{
+				continue;
+			}
+		}
+
+		// If there's any other type of use, bail out.
+		return false;
+	}
+
+	return true;
+}
+
 const Ice::InstIntrinsic *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
 {
 	if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsic>(instruction))
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index ada1072..1e0cf63 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -101,7 +101,7 @@
 
 	auto bytes = Ice::ConstantInteger32::create(function->getContext(), Ice::IceType_i32, totalSize);
 	auto address = function->makeVariable(getPointerType(type));
-	auto alloca = Ice::InstAlloca::create(function, address, bytes, typeSize);
+	auto alloca = Ice::InstAlloca::create(function, address, bytes, typeSize);  // SRoA depends on the alignment to match the type size.
 	function->getEntryNode()->getInsts().push_front(alloca);
 
 	return address;
@@ -1104,7 +1104,7 @@
 
 	auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
 	auto address = ::function->makeVariable(T(getPointerType(t)));
-	auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);
+	auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);  // SRoA depends on the alignment to match the type size.
 	::function->getEntryNode()->getInsts().push_front(alloca);
 
 	return V(address);
diff --git a/tests/ReactorUnitTests/ReactorUnitTests.cpp b/tests/ReactorUnitTests/ReactorUnitTests.cpp
index 51a7531..fc8665a 100644
--- a/tests/ReactorUnitTests/ReactorUnitTests.cpp
+++ b/tests/ReactorUnitTests/ReactorUnitTests.cpp
@@ -508,6 +508,94 @@
 	EXPECT_EQ(result, 222);
 }
 
+TEST(ReactorUnitTests, ModifyLocalThroughPointer)
+{
+	FunctionT<int(void)> function;
+	{
+		Int a = 1;
+
+		Pointer<Int> p = &a;
+		Pointer<Pointer<Int>> pp = &p;
+
+		Pointer<Int> q = *pp;
+		*q = 3;
+
+		Return(a);
+	}
+
+	auto routine = function(testName().c_str());
+
+	int result = routine();
+	EXPECT_EQ(result, 3);
+}
+
+TEST(ReactorUnitTests, ScalarReplacementOfArray)
+{
+	FunctionT<int(void)> function;
+	{
+		Array<Int, 2> a;
+		a[0] = 1;
+		a[1] = 2;
+
+		Return(a[0] + a[1]);
+	}
+
+	auto routine = function(testName().c_str());
+
+	int result = routine();
+	EXPECT_EQ(result, 3);
+}
+
+TEST(ReactorUnitTests, CArray)
+{
+	FunctionT<int(void)> function;
+	{
+		Int a[2];
+		a[0] = 1;
+		a[1] = 2;
+
+		auto x = a[0];
+		a[0] = a[1];
+		a[1] = x;
+
+		Return(a[0] + a[1]);
+	}
+
+	auto routine = function(testName().c_str());
+
+	int result = routine();
+	EXPECT_EQ(result, 3);
+}
+
+// SRoA should replace the array elements with scalars, which in turn enables
+// eliminating all loads and stores.
+TEST(ReactorUnitTests, ReactorArray)
+{
+	FunctionT<int(void)> function;
+	{
+		Array<Int, 2> a;
+		a[0] = 1;
+		a[1] = 2;
+
+		Int x = a[0];
+		a[0] = a[1];
+		a[1] = x;
+
+		Return(a[0] + a[1]);
+	}
+
+	Nucleus::setOptimizerCallback([](const Nucleus::OptimizerReport *report) {
+		EXPECT_EQ(report->allocas, 0);
+		EXPECT_EQ(report->loads, 0);
+		EXPECT_EQ(report->stores, 0);
+	});
+
+	auto routine = function(testName().c_str());
+
+	int result = routine();
+	EXPECT_EQ(result, 3);
+}
+
 TEST(ReactorUnitTests, SubVectorLoadStore)
 {
 	FunctionT<int(void *, void *)> function;