GLES: Remove hardcoded shader limits

Perform whole-shader analysis to determine usage limits.
Use these limits to allocate compile time and runtime arrays.

Removes the constants:
	MAX_SHADER_NESTED_LOOPS
	MAX_SHADER_NESTED_IFS
	MAX_SHADER_CALL_STACK_SIZE

Also switched to using dynamic containers to remove the MAX_SHADER_CALL_SITES limit, which I believe to have been buggy and broken.

Bug: b/123587120
Change-Id: I89be80072183ac2aac28124df236888309e7207c
Reviewed-on: https://swiftshader-review.googlesource.com/c/24668
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
diff --git a/src/Common/Types.hpp b/src/Common/Types.hpp
index fac6d36..7259b20 100644
--- a/src/Common/Types.hpp
+++ b/src/Common/Types.hpp
@@ -152,11 +152,13 @@
 		return v;
 	}
 
-	template <int limit> class BoundedIndex
+	class BoundedIndex
 	{
 	public:
 		BoundedIndex(int index) : index(index) {}
 
+		inline void setLimit(int limit) { this->limit = limit; }
+
 		inline int operator++(int) { return index++; }
 		inline int operator--(int) { return index--; }
 		inline void operator=(int i) { index = i; }
@@ -190,6 +192,7 @@
 
 	private:
 		int index = 0;
+		int limit = 0;
 	};
 
 	// The OFFSET macro is a generalization of the offsetof() macro defined in <cstddef>.
diff --git a/src/Main/Config.hpp b/src/Main/Config.hpp
index f0e9563..46ba961 100644
--- a/src/Main/Config.hpp
+++ b/src/Main/Config.hpp
@@ -97,9 +97,6 @@
 		MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
 		RENDERTARGETS = 8,
 		NUM_TEMPORARY_REGISTERS = 4096,
-		MAX_SHADER_CALL_SITES = 2048,
-		MAX_SHADER_NESTED_LOOPS = 4,
-		MAX_SHADER_NESTED_IFS = 24 + 24,
 		MAX_SHADER_CALL_STACK_SIZE = 64,
 		MAX_SHADER_ENABLE_STACK_SIZE = 1 + 24,
 	};
diff --git a/src/Shader/PixelProgram.cpp b/src/Shader/PixelProgram.cpp
index ca44d19..843ba32 100644
--- a/src/Shader/PixelProgram.cpp
+++ b/src/Shader/PixelProgram.cpp
@@ -25,6 +25,39 @@
 	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
 	extern bool fullPixelPositionRegister;
 
+	PixelProgram::PixelProgram(const PixelProcessor::State &state, const PixelShader *shader) :
+			PixelRoutine(state, shader),
+			r(shader->indirectAddressableTemporaries),
+			aL(shader->getLimits().loops),
+			increment(shader->getLimits().loops),
+			iteration(shader->getLimits().loops),
+			callStack(shader->getLimits().stack)
+	{
+		auto limits = shader->getLimits();
+		ifDepth.setLimit(limits.ifs);
+		loopRepDepth.setLimit(limits.loops);
+		currentLabel.setLimit(limits.functions);
+
+		ifFalseBlock.resize(limits.ifs);
+		loopRepTestBlock.resize(limits.loops);
+		loopRepEndBlock.resize(limits.loops);
+		labelBlock.resize(limits.functions);
+		isConditionalIf.resize(limits.ifs);
+
+		loopDepth = -1;
+		enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+
+		if(shader->containsBreakInstruction())
+		{
+			enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+		}
+
+		if(shader->containsContinueInstruction())
+		{
+			enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+		}
+	}
+
 	void PixelProgram::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
 	{
 		if(shader->getShaderModel() >= 0x0300)
diff --git a/src/Shader/PixelProgram.hpp b/src/Shader/PixelProgram.hpp
index b8548d7..44b5d21 100644
--- a/src/Shader/PixelProgram.hpp
+++ b/src/Shader/PixelProgram.hpp
@@ -18,33 +18,14 @@
 #include "PixelRoutine.hpp"
 #include "SamplerCore.hpp"
 
+#include <unordered_map>
+
 namespace sw
 {
 	class PixelProgram : public PixelRoutine
 	{
 	public:
-		PixelProgram(const PixelProcessor::State &state, const PixelShader *shader) :
-			PixelRoutine(state, shader), r(shader->indirectAddressableTemporaries)
-		{
-			for(int i = 0; i < MAX_SHADER_CALL_SITES; ++i)
-			{
-				labelBlock[i] = 0;
-			}
-
-			loopDepth = -1;
-			enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-
-			if(shader->containsBreakInstruction())
-			{
-				enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-			}
-
-			if(shader->containsContinueInstruction())
-			{
-				enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-			}
-		}
-
+		PixelProgram(const PixelProcessor::State &state, const PixelShader *shader);
 		virtual ~PixelProgram() {}
 
 	protected:
@@ -67,13 +48,13 @@
 
 		// DX9 specific variables
 		Vector4f p0;
-		Array<Int, MAX_SHADER_NESTED_LOOPS> aL;
-		Array<Int, MAX_SHADER_NESTED_LOOPS> increment;
-		Array<Int, MAX_SHADER_NESTED_LOOPS> iteration;
+		Array<Int> aL; // loop counter register
+		Array<Int> increment; // increment value per loop
+		Array<Int> iteration; // iteration count
 
 		Int loopDepth;    // FIXME: Add support for switch
 		Int stackIndex;   // FIXME: Inc/decrement callStack
-		Array<UInt, MAX_SHADER_CALL_STACK_SIZE> callStack;
+		Array<UInt> callStack;
 
 		// Per pixel based on conditions reached
 		Int enableIndex;
@@ -153,18 +134,18 @@
 		void RET();
 		void LEAVE();
 
-		BoundedIndex<MAX_SHADER_NESTED_IFS> ifDepth = 0;
-		BoundedIndex<MAX_SHADER_NESTED_LOOPS> loopRepDepth = 0;
-		BoundedIndex<MAX_SHADER_CALL_SITES> currentLabel = -1;
+		BoundedIndex ifDepth = 0;
+		BoundedIndex loopRepDepth = 0;
+		BoundedIndex currentLabel = -1;
 		bool scalar = false;
 
-		BasicBlock *ifFalseBlock[MAX_SHADER_NESTED_IFS];
-		BasicBlock *loopRepTestBlock[MAX_SHADER_NESTED_LOOPS];
-		BasicBlock *loopRepEndBlock[MAX_SHADER_NESTED_LOOPS];
-		BasicBlock *labelBlock[MAX_SHADER_CALL_SITES];
-		std::vector<BasicBlock*> callRetBlock[MAX_SHADER_CALL_SITES];
+		std::vector<BasicBlock*> ifFalseBlock;
+		std::vector<BasicBlock*> loopRepTestBlock;
+		std::vector<BasicBlock*> loopRepEndBlock;
+		std::vector<BasicBlock*> labelBlock;
+		std::unordered_map<unsigned int, std::vector<BasicBlock*>> callRetBlock; // label -> list of call sites
 		BasicBlock *returnBlock;
-		bool isConditionalIf[MAX_SHADER_NESTED_IFS];
+		std::vector<bool> isConditionalIf;
 		std::vector<Int4> restoreContinue;
 	};
 }
diff --git a/src/Shader/PixelShader.cpp b/src/Shader/PixelShader.cpp
index d24e7c2..1af50d8 100644
--- a/src/Shader/PixelShader.cpp
+++ b/src/Shader/PixelShader.cpp
@@ -161,6 +161,7 @@
 		analyzeSamplers();
 		analyzeCallSites();
 		analyzeIndirectAddressing();
+		analyzeLimits();
 	}
 
 	void PixelShader::analyzeZOverride()
diff --git a/src/Shader/Shader.cpp b/src/Shader/Shader.cpp
index db2b2bc..3528b1e 100644
--- a/src/Shader/Shader.cpp
+++ b/src/Shader/Shader.cpp
@@ -19,10 +19,14 @@
 #include "Common/Math.hpp"
 #include "Common/Debug.hpp"
 
+#include <algorithm>
 #include <set>
 #include <fstream>
+#include <functional>
 #include <sstream>
 #include <stdarg.h>
+#include <unordered_map>
+#include <unordered_set>
 
 namespace sw
 {
@@ -1877,15 +1881,13 @@
 	// This is used to know what basic block to return to.
 	void Shader::analyzeCallSites()
 	{
-		int callSiteIndex[MAX_SHADER_CALL_SITES] = {0};
+		std::unordered_map<int, int> callSiteIndices;
 
 		for(auto &inst : instruction)
 		{
 			if(inst->opcode == OPCODE_CALL || inst->opcode == OPCODE_CALLNZ)
 			{
-				int label = sw::min(inst->dst.label, static_cast<unsigned int>(MAX_SHADER_CALL_SITES));
-
-				inst->dst.callSite = callSiteIndex[label]++;
+				inst->dst.callSite = callSiteIndices[inst->dst.label]++;
 			}
 		}
 	}
@@ -1924,4 +1926,177 @@
 			}
 		}
 	}
+
+	// analyzeLimits analyzes the whole shader program to determine the deepest
+	// nesting of control flow blocks and function calls. These calculations
+	// are stored into the limits member, and is used by the programs to
+	// allocate stack storage variables.
+	void Shader::analyzeLimits()
+	{
+		typedef unsigned int FunctionID;
+
+		// Identifier of the function with the main entry point.
+		constexpr FunctionID MAIN_ID = 0xF0000000;
+
+		// Invalid function identifier.
+		constexpr FunctionID INVALID_ID = ~0U;
+
+		// Limits on a single function.
+		struct FunctionLimits
+		{
+			uint32_t loops = 0; // maximum nested loop and reps.
+			uint32_t ifs = 0; // maximum nested if statements.
+			uint32_t stack = 0; // maximum call depth.
+		};
+
+		// Information about a single function in the shader.
+		struct FunctionInfo
+		{
+			FunctionLimits limits;
+			std::unordered_set<FunctionID> calls; // What this function calls.
+			bool reachable; // Is this function reachable?
+		};
+
+		// Begin by doing a pass over the instructions to identify all the
+		// labels that denote the start of a function.
+		std::unordered_map<FunctionID, FunctionInfo> functions;
+
+		for(auto &inst : instruction)
+		{
+			if (inst->isCall())
+			{
+				FunctionID id = inst->dst.label;
+				ASSERT(id != MAIN_ID); // If this fires, we're going to have to represent main with something else.
+				functions[id] = FunctionInfo();
+			}
+		}
+
+		// Add a definition for the main entry point.
+		// This starts at the beginning of the instructions and does not have
+		// its own label.
+		functions[MAIN_ID] = FunctionInfo();
+		functions[MAIN_ID].reachable = true;
+		FunctionID currentFunc = MAIN_ID;
+
+		// Limits for the currently analyzed function.
+		FunctionLimits currentLimits;
+
+		// Now loop over the instructions gathering the limits of each of the
+		// functions.
+		for(size_t i = 0; i < instruction.size(); i++)
+		{
+			const auto& inst = instruction[i];
+			switch (inst->opcode)
+			{
+				case OPCODE_LABEL:
+				{
+					FunctionID id = inst->dst.label;
+					auto it = functions.find(id);
+					if (it == functions.end())
+					{
+						// Label does not start a new function.
+						continue;
+					}
+					ASSERT(currentFunc == INVALID_ID); // Functions overlap
+					currentFunc = id;
+					break;
+				}
+				case OPCODE_CALL:
+				case OPCODE_CALLNZ:
+				{
+					ASSERT(currentFunc != INVALID_ID);
+					FunctionID id = inst->dst.label;
+					functions[currentFunc].calls.emplace(id);
+					functions[id].reachable = true;
+					break;
+				}
+				case OPCODE_LOOP:
+				case OPCODE_REP:
+				case OPCODE_WHILE:
+				case OPCODE_SWITCH: // Not a mistake - switches share loopReps.
+				{
+					ASSERT(currentFunc != INVALID_ID);
+					++currentLimits.loops;
+					auto& func = functions[currentFunc];
+					func.limits.loops = std::max(func.limits.loops, currentLimits.loops);
+					break;
+				}
+				case OPCODE_ENDLOOP:
+				case OPCODE_ENDREP:
+				case OPCODE_ENDWHILE:
+				case OPCODE_ENDSWITCH:
+				{
+					ASSERT(currentLimits.loops > 0);
+					--currentLimits.loops;
+					break;
+				}
+				case OPCODE_IF:
+				case OPCODE_IFC:
+				{
+					ASSERT(currentFunc != INVALID_ID);
+					++currentLimits.ifs;
+					auto& func = functions[currentFunc];
+					func.limits.ifs = std::max(func.limits.ifs, currentLimits.ifs);
+					break;
+				}
+				case OPCODE_ENDIF:
+				{
+					ASSERT(currentLimits.ifs > 0);
+					currentLimits.ifs--;
+					break;
+				}
+				case OPCODE_RET:
+				{
+					// Must be in a function to return.
+					ASSERT(currentFunc != INVALID_ID);
+
+					// All stacks should be popped before returning.
+					ASSERT(currentLimits.ifs == 0);
+					ASSERT(currentLimits.loops == 0);
+
+					currentFunc = INVALID_ID;
+					currentLimits = FunctionLimits();
+					break;
+				}
+				default:
+					break;
+			}
+		}
+
+		// Assert that every function is reachable (these should have been
+		// stripped in earlier stages). Unreachable functions may be code
+		// generated, but their own limits are not considered below, potentially
+		// causing OOB indexing in later stages.
+		for (auto it : functions) { ASSERT(it.second.reachable); }
+
+		// We have now gathered all the information about each of the functions
+		// in the shader. Traverse these functions starting from the main
+		// function to calculate the maximum limits across the entire shader.
+
+		std::unordered_set<FunctionID> visited;
+		std::function<Limits(FunctionID)> traverse;
+		traverse = [&](FunctionID id) -> Limits
+		{
+			const auto& func = functions[id];
+			ASSERT(visited.count(id) == 0); // Sanity check: Recursive functions are not allowed.
+			visited.insert(id);
+			Limits limits;
+			limits.stack = 1;
+			for (auto callee : func.calls)
+			{
+				auto calleeLimits = traverse(callee);
+				limits.loops = std::max(limits.loops, calleeLimits.loops);
+				limits.ifs = std::max(limits.ifs, calleeLimits.ifs);
+				limits.stack = std::max(limits.stack, calleeLimits.stack + 1);
+			}
+			visited.erase(id);
+
+			limits.loops += func.limits.loops;
+			limits.ifs += func.limits.ifs;
+			return limits;
+		};
+
+		limits = traverse(MAIN_ID);
+		limits.functions = (uint32_t)functions.size();
+	}
 }
diff --git a/src/Shader/Shader.hpp b/src/Shader/Shader.hpp
index ada88ba..64a86cd 100644
--- a/src/Shader/Shader.hpp
+++ b/src/Shader/Shader.hpp
@@ -35,6 +35,7 @@
 		enum Opcode
 		{
 			// Matches order in d3d9types.h
+			// See https://docs.microsoft.com/en-us/windows-hardware/drivers/ddi/content/d3d9types/ne-d3d9types-_d3dshader_instruction_opcode_type
 			OPCODE_NOP = 0,
 			OPCODE_MOV,
 			OPCODE_ADD,
@@ -554,6 +555,15 @@
 			};
 		};
 
+		// Limits holds the maximum nested counts for the shader.
+		struct Limits
+		{
+			uint32_t loops = 0; // maximum nested loop and reps.
+			uint32_t ifs = 0; // maximum nested if statements.
+			uint32_t stack = 0; // maximum call depth.
+			uint32_t functions = 0; // total number of functions.
+		};
+
 		Shader();
 
 		virtual ~Shader();
@@ -562,6 +572,7 @@
 		size_t getLength() const;
 		ShaderType getShaderType() const;
 		unsigned short getShaderModel() const;
+		inline const Limits& getLimits() const { return limits; }
 
 		void append(Instruction *instruction);
 		void declareSampler(int i);
@@ -629,8 +640,11 @@
 		void analyzeSamplers();
 		void analyzeCallSites();
 		void analyzeIndirectAddressing();
+		void analyzeLimits();
 		void markFunctionAnalysis(unsigned int functionLabel, Analysis flag);
 
+		Limits limits; // Calculated in analyzeLimits().
+
 		ShaderType shaderType;
 
 		union
diff --git a/src/Shader/VertexProgram.cpp b/src/Shader/VertexProgram.cpp
index 8bae9cf..0463ea6 100644
--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -24,12 +24,24 @@
 namespace sw
 {
 	VertexProgram::VertexProgram(const VertexProcessor::State &state, const VertexShader *shader)
-		: VertexRoutine(state, shader), shader(shader), r(shader->indirectAddressableTemporaries)
+		: VertexRoutine(state, shader),
+		  shader(shader),
+		  r(shader->indirectAddressableTemporaries),
+		  aL(shader->getLimits().loops),
+		  increment(shader->getLimits().loops),
+		  iteration(shader->getLimits().loops),
+		  callStack(shader->getLimits().stack)
 	{
-		for(int i = 0; i < MAX_SHADER_CALL_SITES; i++)
-		{
-			labelBlock[i] = 0;
-		}
+		auto limits = shader->getLimits();
+		ifDepth.setLimit(limits.ifs);
+		loopRepDepth.setLimit(limits.loops);
+		currentLabel.setLimit(limits.functions);
+
+		ifFalseBlock.resize(limits.ifs);
+		loopRepTestBlock.resize(limits.loops);
+		loopRepEndBlock.resize(limits.loops);
+		labelBlock.resize(limits.functions);
+		isConditionalIf.resize(limits.ifs);
 
 		loopDepth = -1;
 		enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
diff --git a/src/Shader/VertexProgram.hpp b/src/Shader/VertexProgram.hpp
index 379abce..3f794fa 100644
--- a/src/Shader/VertexProgram.hpp
+++ b/src/Shader/VertexProgram.hpp
@@ -22,6 +22,8 @@
 #include "Renderer/Stream.hpp"
 #include "Common/Types.hpp"
 
+#include <unordered_map>
+
 namespace sw
 {
 	struct Stream;
@@ -39,15 +41,15 @@
 
 		RegisterArray<NUM_TEMPORARY_REGISTERS> r;   // Temporary registers
 		Vector4f a0;
-		Array<Int, MAX_SHADER_NESTED_LOOPS> aL;
+		Array<Int> aL; // loop counter register
 		Vector4f p0;
 
-		Array<Int, MAX_SHADER_NESTED_LOOPS> increment;
-		Array<Int, MAX_SHADER_NESTED_LOOPS> iteration;
+		Array<Int> increment;
+		Array<Int> iteration;
 
 		Int loopDepth;
 		Int stackIndex;   // FIXME: Inc/decrement callStack
-		Array<UInt, MAX_SHADER_CALL_STACK_SIZE> callStack;
+		Array<UInt> callStack;
 
 		Int enableIndex;
 		Array<Int4, MAX_SHADER_ENABLE_STACK_SIZE> enableStack;
@@ -122,18 +124,18 @@
 		Vector4f sampleTexture(const Src &s, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
 		Vector4f sampleTexture(int sampler, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
 
-		BoundedIndex<MAX_SHADER_NESTED_IFS> ifDepth = 0;
-		BoundedIndex<MAX_SHADER_NESTED_LOOPS> loopRepDepth = 0;
-		BoundedIndex<MAX_SHADER_CALL_SITES> currentLabel = -1;
+		BoundedIndex ifDepth = 0;
+		BoundedIndex loopRepDepth = 0;
+		BoundedIndex currentLabel = -1;
 		bool scalar = false;
 
-		BasicBlock *ifFalseBlock[MAX_SHADER_NESTED_IFS];
-		BasicBlock *loopRepTestBlock[MAX_SHADER_NESTED_LOOPS];
-		BasicBlock *loopRepEndBlock[MAX_SHADER_NESTED_LOOPS];
-		BasicBlock *labelBlock[MAX_SHADER_CALL_SITES];
-		std::vector<BasicBlock*> callRetBlock[MAX_SHADER_CALL_SITES];
+		std::vector<BasicBlock*> ifFalseBlock;
+		std::vector<BasicBlock*> loopRepTestBlock;
+		std::vector<BasicBlock*> loopRepEndBlock;
+		std::vector<BasicBlock*> labelBlock;
+		std::unordered_map<unsigned int, std::vector<BasicBlock*>> callRetBlock; // label -> list of call sites
 		BasicBlock *returnBlock;
-		bool isConditionalIf[MAX_SHADER_NESTED_IFS];
+		std::vector<bool> isConditionalIf;
 		std::vector<Int4> restoreContinue;
 	};
 }
diff --git a/src/Shader/VertexShader.cpp b/src/Shader/VertexShader.cpp
index 8f1c4f8..7b2455c 100644
--- a/src/Shader/VertexShader.cpp
+++ b/src/Shader/VertexShader.cpp
@@ -208,6 +208,7 @@
 		analyzeSamplers();
 		analyzeCallSites();
 		analyzeIndirectAddressing();
+		analyzeLimits();
 	}
 
 	void VertexShader::analyzeInput()