Implement gather/scatter operations for shader register files.

This allows the registers to be addressed with a vector of indices.

Also rename 'dynamic' register files to 'indirect addressable', to
disambiguate from 'dynamic indexing' at the shader level. Indexing with
a uniform does not require gather/scatter operations, but does require
indirect addressing.

Bug chromium:845103
Bug skia:7846

Change-Id: I3c42be33def66328688f2900c61c80246bf1e584
Reviewed-on: https://swiftshader-review.googlesource.com/18989
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Shader/PixelProgram.hpp b/src/Shader/PixelProgram.hpp
index 1f60bde..ef6c2c0 100644
--- a/src/Shader/PixelProgram.hpp
+++ b/src/Shader/PixelProgram.hpp
@@ -24,7 +24,7 @@
 	{
 	public:
 		PixelProgram(const PixelProcessor::State &state, const PixelShader *shader) :
-			PixelRoutine(state, shader), r(shader->dynamicallyIndexedTemporaries),
+			PixelRoutine(state, shader), r(shader->indirectAddressableTemporaries),
 			loopDepth(-1), ifDepth(0), loopRepDepth(0), currentLabel(-1), whileTest(false)
 		{
 			for(int i = 0; i < 2048; ++i)
diff --git a/src/Shader/PixelRoutine.cpp b/src/Shader/PixelRoutine.cpp
index 1c300b0..146e42d 100644
--- a/src/Shader/PixelRoutine.cpp
+++ b/src/Shader/PixelRoutine.cpp
@@ -29,7 +29,8 @@
 	extern bool exactColorRounding;
 	extern bool forceClearRegisters;
 
-	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
+	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
+		: QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
 	{
 		if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters)
 		{
diff --git a/src/Shader/PixelShader.cpp b/src/Shader/PixelShader.cpp
index 9e281d9..d24e7c2 100644
--- a/src/Shader/PixelShader.cpp
+++ b/src/Shader/PixelShader.cpp
@@ -160,7 +160,7 @@
 		analyzeDynamicBranching();
 		analyzeSamplers();
 		analyzeCallSites();
-		analyzeDynamicIndexing();
+		analyzeIndirectAddressing();
 	}
 
 	void PixelShader::analyzeZOverride()
diff --git a/src/Shader/Shader.cpp b/src/Shader/Shader.cpp
index 6874051..36192c9 100644
--- a/src/Shader/Shader.cpp
+++ b/src/Shader/Shader.cpp
@@ -1890,40 +1890,34 @@
 		}
 	}
 
-	void Shader::analyzeDynamicIndexing()
+	void Shader::analyzeIndirectAddressing()
 	{
-		dynamicallyIndexedTemporaries = false;
-		dynamicallyIndexedInput = false;
-		dynamicallyIndexedOutput = false;
+		indirectAddressableTemporaries = false;
+		indirectAddressableInput = false;
+		indirectAddressableOutput = false;
 
 		for(const auto &inst : instruction)
 		{
-			if(inst->dst.rel.type == PARAMETER_ADDR ||
-			   inst->dst.rel.type == PARAMETER_LOOP ||
-			   inst->dst.rel.type == PARAMETER_TEMP ||
-			   inst->dst.rel.type == PARAMETER_CONST)
+			if(inst->dst.rel.type != PARAMETER_VOID)
 			{
 				switch(inst->dst.type)
 				{
-				case PARAMETER_TEMP:   dynamicallyIndexedTemporaries = true; break;
-				case PARAMETER_INPUT:  dynamicallyIndexedInput = true;       break;
-				case PARAMETER_OUTPUT: dynamicallyIndexedOutput = true;      break;
+				case PARAMETER_TEMP:   indirectAddressableTemporaries = true; break;
+				case PARAMETER_INPUT:  indirectAddressableInput = true;       break;
+				case PARAMETER_OUTPUT: indirectAddressableOutput = true;      break;
 				default: break;
 				}
 			}
 
 			for(int j = 0; j < 3; j++)
 			{
-				if(inst->src[j].rel.type == PARAMETER_ADDR ||
-				   inst->src[j].rel.type == PARAMETER_LOOP ||
-				   inst->src[j].rel.type == PARAMETER_TEMP ||
-				   inst->src[j].rel.type == PARAMETER_CONST)
+				if(inst->src[j].rel.type != PARAMETER_VOID)
 				{
 					switch(inst->src[j].type)
 					{
-					case PARAMETER_TEMP:   dynamicallyIndexedTemporaries = true; break;
-					case PARAMETER_INPUT:  dynamicallyIndexedInput = true;       break;
-					case PARAMETER_OUTPUT: dynamicallyIndexedOutput = true;      break;
+					case PARAMETER_TEMP:   indirectAddressableTemporaries = true; break;
+					case PARAMETER_INPUT:  indirectAddressableInput = true;       break;
+					case PARAMETER_OUTPUT: indirectAddressableOutput = true;      break;
 					default: break;
 					}
 				}
diff --git a/src/Shader/Shader.hpp b/src/Shader/Shader.hpp
index 6755cd4..6d431f5 100644
--- a/src/Shader/Shader.hpp
+++ b/src/Shader/Shader.hpp
@@ -612,9 +612,9 @@
 		unsigned int dirtyConstantsI;
 		unsigned int dirtyConstantsB;
 
-		bool dynamicallyIndexedTemporaries;
-		bool dynamicallyIndexedInput;
-		bool dynamicallyIndexedOutput;
+		bool indirectAddressableTemporaries;
+		bool indirectAddressableInput;
+		bool indirectAddressableOutput;
 
 	protected:
 		void parse(const unsigned long *token);
@@ -627,7 +627,7 @@
 		void analyzeDynamicBranching();
 		void analyzeSamplers();
 		void analyzeCallSites();
-		void analyzeDynamicIndexing();
+		void analyzeIndirectAddressing();
 		void markFunctionAnalysis(unsigned int functionLabel, Analysis flag);
 
 		ShaderType shaderType;
diff --git a/src/Shader/ShaderCore.cpp b/src/Shader/ShaderCore.cpp
index 338605c..4ea3260 100644
--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -560,6 +560,100 @@
 		}
 	}
 
+	const Vector4f RegisterFile::operator[](RValue<Int4> index)
+	{
+		ASSERT(indirectAddressable);
+
+		Int index0 = Extract(index, 0);
+		Int index1 = Extract(index, 1);
+		Int index2 = Extract(index, 2);
+		Int index3 = Extract(index, 3);
+
+		Vector4f r;
+
+		r.x.x = Extract(x[0][index0], 0);
+		r.x.y = Extract(x[0][index1], 1);
+		r.x.z = Extract(x[0][index2], 2);
+		r.x.w = Extract(x[0][index3], 3);
+
+		r.y.x = Extract(y[0][index0], 0);
+		r.y.y = Extract(y[0][index1], 1);
+		r.y.z = Extract(y[0][index2], 2);
+		r.y.w = Extract(y[0][index3], 3);
+
+		r.z.x = Extract(z[0][index0], 0);
+		r.z.y = Extract(z[0][index1], 1);
+		r.z.z = Extract(z[0][index2], 2);
+		r.z.w = Extract(z[0][index3], 3);
+
+		r.w.x = Extract(w[0][index0], 0);
+		r.w.y = Extract(w[0][index1], 1);
+		r.w.z = Extract(w[0][index2], 2);
+		r.w.w = Extract(w[0][index3], 3);
+
+		return r;
+	}
+
+	void RegisterFile::scatter_x(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int index0 = Extract(index, 0);
+		Int index1 = Extract(index, 1);
+		Int index2 = Extract(index, 2);
+		Int index3 = Extract(index, 3);
+
+		x[0][index0] = Insert(x[0][index0], Extract(r, 0), 0);
+		x[0][index1] = Insert(x[0][index1], Extract(r, 1), 1);
+		x[0][index2] = Insert(x[0][index2], Extract(r, 2), 2);
+		x[0][index3] = Insert(x[0][index3], Extract(r, 3), 3);
+	}
+
+	void RegisterFile::scatter_y(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int index0 = Extract(index, 0);
+		Int index1 = Extract(index, 1);
+		Int index2 = Extract(index, 2);
+		Int index3 = Extract(index, 3);
+
+		y[0][index0] = Insert(y[0][index0], Extract(r, 0), 0);
+		y[0][index1] = Insert(y[0][index1], Extract(r, 1), 1);
+		y[0][index2] = Insert(y[0][index2], Extract(r, 2), 2);
+		y[0][index3] = Insert(y[0][index3], Extract(r, 3), 3);
+	}
+
+	void RegisterFile::scatter_z(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int index0 = Extract(index, 0);
+		Int index1 = Extract(index, 1);
+		Int index2 = Extract(index, 2);
+		Int index3 = Extract(index, 3);
+
+		z[0][index0] = Insert(z[0][index0], Extract(r, 0), 0);
+		z[0][index1] = Insert(z[0][index1], Extract(r, 1), 1);
+		z[0][index2] = Insert(z[0][index2], Extract(r, 2), 2);
+		z[0][index3] = Insert(z[0][index3], Extract(r, 3), 3);
+	}
+
+	void RegisterFile::scatter_w(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int index0 = Extract(index, 0);
+		Int index1 = Extract(index, 1);
+		Int index2 = Extract(index, 2);
+		Int index3 = Extract(index, 3);
+
+		w[0][index0] = Insert(w[0][index0], Extract(r, 0), 0);
+		w[0][index1] = Insert(w[0][index1], Extract(r, 1), 1);
+		w[0][index2] = Insert(w[0][index2], Extract(r, 2), 2);
+		w[0][index3] = Insert(w[0][index3], Extract(r, 3), 3);
+	}
+
 	void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
 	{
 		if(integerDestination)
diff --git a/src/Shader/ShaderCore.hpp b/src/Shader/ShaderCore.hpp
index 249e058..4dc109f 100644
--- a/src/Shader/ShaderCore.hpp
+++ b/src/Shader/ShaderCore.hpp
@@ -147,31 +147,30 @@
 		Reference<Float4> w;
 	};
 
-	template<int S, bool D = false>
-	class RegisterArray
+	class RegisterFile
 	{
 	public:
-		RegisterArray(bool dynamic = D) : dynamic(dynamic)
+		RegisterFile(int size, bool indirectAddressable) : size(size), indirectAddressable(indirectAddressable)
 		{
-			if(dynamic)
+			if(indirectAddressable)
 			{
-				x = new Array<Float4>(S);
-				y = new Array<Float4>(S);
-				z = new Array<Float4>(S);
-				w = new Array<Float4>(S);
+				x = new Array<Float4>(size);
+				y = new Array<Float4>(size);
+				z = new Array<Float4>(size);
+				w = new Array<Float4>(size);
 			}
 			else
 			{
-				x = new Array<Float4>[S];
-				y = new Array<Float4>[S];
-				z = new Array<Float4>[S];
-				w = new Array<Float4>[S];
+				x = new Array<Float4>[size];
+				y = new Array<Float4>[size];
+				z = new Array<Float4>[size];
+				w = new Array<Float4>[size];
 			}
 		}
 
-		~RegisterArray()
+		~RegisterFile()
 		{
-			if(dynamic)
+			if(indirectAddressable)
 			{
 				delete x;
 				delete y;
@@ -189,7 +188,7 @@
 
 		Register operator[](int i)
 		{
-			if(dynamic)
+			if(indirectAddressable)
 			{
 				return Register(x[0][i], y[0][i], z[0][i], w[0][i]);
 			}
@@ -201,19 +200,36 @@
 
 		Register operator[](RValue<Int> i)
 		{
-			ASSERT(dynamic);
+			ASSERT(indirectAddressable);
 
 			return Register(x[0][i], y[0][i], z[0][i], w[0][i]);
 		}
 
-	private:
-		const bool dynamic;
+		const Vector4f operator[](RValue<Int4> i);   // Gather operation (read only).
+
+		void scatter_x(Int4 i, RValue<Float4> r);
+		void scatter_y(Int4 i, RValue<Float4> r);
+		void scatter_z(Int4 i, RValue<Float4> r);
+		void scatter_w(Int4 i, RValue<Float4> r);
+
+	protected:
+		const int size;
+		const bool indirectAddressable;
 		Array<Float4> *x;
 		Array<Float4> *y;
 		Array<Float4> *z;
 		Array<Float4> *w;
 	};
 
+	template<int S, bool I = false>
+	class RegisterArray : public RegisterFile
+	{
+	public:
+		RegisterArray(bool indirectAddressable = I) : RegisterFile(S, indirectAddressable)
+		{
+		}
+	};
+
 	class ShaderCore
 	{
 		typedef Shader::Control Control;
diff --git a/src/Shader/VertexProgram.cpp b/src/Shader/VertexProgram.cpp
index 4f8ba1a..8dbd600 100644
--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -24,7 +24,7 @@
 namespace sw
 {
 	VertexProgram::VertexProgram(const VertexProcessor::State &state, const VertexShader *shader)
-		: VertexRoutine(state, shader), shader(shader), r(shader->dynamicallyIndexedTemporaries)
+		: VertexRoutine(state, shader), shader(shader), r(shader->indirectAddressableTemporaries)
 	{
 		ifDepth = 0;
 		loopRepDepth = 0;
diff --git a/src/Shader/VertexRoutine.cpp b/src/Shader/VertexRoutine.cpp
index 2d7c2c6..9b8d336 100644
--- a/src/Shader/VertexRoutine.cpp
+++ b/src/Shader/VertexRoutine.cpp
@@ -27,8 +27,8 @@
 	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
 
 	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
-		: v(shader && shader->dynamicallyIndexedInput),
-		  o(shader && shader->dynamicallyIndexedOutput),
+		: v(shader && shader->indirectAddressableInput),
+		  o(shader && shader->indirectAddressableOutput),
 		  state(state)
 	{
 	}
diff --git a/src/Shader/VertexShader.cpp b/src/Shader/VertexShader.cpp
index 33c2241..8f1c4f8 100644
--- a/src/Shader/VertexShader.cpp
+++ b/src/Shader/VertexShader.cpp
@@ -176,7 +176,7 @@
 		setOutput(posReg, 4, sw::Shader::Semantic(sw::Shader::USAGE_POSITION, 0));
 		positionRegister = posReg;
 	}
-	
+
 	void VertexShader::setPointSizeRegister(int ptSizeReg)
 	{
 		setOutput(ptSizeReg, 4, sw::Shader::Semantic(sw::Shader::USAGE_PSIZE, 0));
@@ -207,7 +207,7 @@
 		analyzeDynamicBranching();
 		analyzeSamplers();
 		analyzeCallSites();
-		analyzeDynamicIndexing();
+		analyzeIndirectAddressing();
 	}
 
 	void VertexShader::analyzeInput()