Fix deterministic loops within conditional blocks, again.

Deterministic loops use the first scalar of the SIMD register that holds
the loop index for addressing arrays. This means that operations on the
index register should not be masked (i.e. the register should be treated
as a scalar).

Previously we were still masking it based on conditional statements, and
we didn't disable the masking altogether for the loop initialization and
initial test. A new shader assembly instruction, 'SCALAR', was added to
disable the masking for this code.

Previously this was conflated with the 'TEST' instruction, which should
independently disable/restore the 'continue' mask.

Bug swiftshader:93
Bug b/118009174

Change-Id: I4add1a6d74231f463217e57adfabdc81faf489ae
Reviewed-on: https://swiftshader-review.googlesource.com/c/22348
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/OpenGL/compiler/OutputASM.cpp b/src/OpenGL/compiler/OutputASM.cpp
index 5a8bcdf..308da1a 100644
--- a/src/OpenGL/compiler/OutputASM.cpp
+++ b/src/OpenGL/compiler/OutputASM.cpp
@@ -1859,6 +1859,11 @@
 		if(loop.isDeterministic())
 		{
 			 deterministicVariables.insert(loop.index->getId());
+
+			 if(!unroll)
+			 {
+				 emit(sw::Shader::OPCODE_SCALAR);   // Unrolled loops don't have an ENDWHILE to disable scalar mode.
+			 }
 		}
 
 		if(node->getType() == ELoopDoWhile)
@@ -1926,6 +1931,11 @@
 
 				emit(sw::Shader::OPCODE_TEST);
 
+				if(loop.isDeterministic())
+				{
+					emit(sw::Shader::OPCODE_SCALAR);
+				}
+
 				if(expression)
 				{
 					expression->traverse(this);
diff --git a/src/Shader/PixelProgram.cpp b/src/Shader/PixelProgram.cpp
index ec0bba0..7dff00b 100644
--- a/src/Shader/PixelProgram.cpp
+++ b/src/Shader/PixelProgram.cpp
@@ -310,6 +310,7 @@
 			case Shader::OPCODE_BREAKP:     BREAKP(src0);                                  break;
 			case Shader::OPCODE_CONTINUE:   CONTINUE();                                    break;
 			case Shader::OPCODE_TEST:       TEST();                                        break;
+			case Shader::OPCODE_SCALAR:     SCALAR();                                      break;
 			case Shader::OPCODE_CALL:       CALL(dst.label, dst.callSite);                 break;
 			case Shader::OPCODE_CALLNZ:     CALLNZ(dst.label, dst.callSite, src0);         break;
 			case Shader::OPCODE_ELSE:       ELSE();                                        break;
@@ -832,6 +833,11 @@
 
 	Int4 PixelProgram::enableMask(const Shader::Instruction *instruction)
 	{
+		if(scalar)
+		{
+			return Int4(0xFFFFFFFF);
+		}
+
 		Int4 enable = instruction->analysisBranch ? Int4(enableStack[enableIndex]) : Int4(0xFFFFFFFF);
 
 		if(shader->containsBreakInstruction() && instruction->analysisBreak)
@@ -1396,6 +1402,11 @@
 		restoreContinue.pop_back();
 	}
 
+	void PixelProgram::SCALAR()
+	{
+		scalar = true;   // While set, enableMask() returns an all-ones mask, i.e. no lane masking.
+	}
+
 	void PixelProgram::CALL(int labelIndex, int callSiteIndex)
 	{
 		if(!labelBlock[labelIndex])
@@ -1572,6 +1583,7 @@
 		Nucleus::setInsertBlock(endBlock);
 
 		enableIndex--;
+		scalar = false;
 	}
 
 	void PixelProgram::ENDSWITCH()
@@ -1777,6 +1789,7 @@
 		Nucleus::setInsertBlock(loopBlock);
 
 		loopRepDepth++;
+		scalar = false;
 	}
 
 	void PixelProgram::SWITCH()
diff --git a/src/Shader/PixelProgram.hpp b/src/Shader/PixelProgram.hpp
index 0f628bc..3c3a06f 100644
--- a/src/Shader/PixelProgram.hpp
+++ b/src/Shader/PixelProgram.hpp
@@ -24,14 +24,14 @@
 	{
 	public:
 		PixelProgram(const PixelProcessor::State &state, const PixelShader *shader) :
-			PixelRoutine(state, shader), r(shader->indirectAddressableTemporaries),
-			loopDepth(-1), ifDepth(0), loopRepDepth(0), currentLabel(-1)
+			PixelRoutine(state, shader), r(shader->indirectAddressableTemporaries)
 		{
 			for(int i = 0; i < 2048; ++i)
 			{
 				labelBlock[i] = 0;
 			}
 
+			loopDepth = -1;
 			enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 
 			if(shader->containsBreakInstruction())
@@ -129,6 +129,7 @@
 		void BREAK(Int4 &condition);
 		void CONTINUE();
 		void TEST();
+		void SCALAR();
 		void CALL(int labelIndex, int callSiteIndex);
 		void CALLNZ(int labelIndex, int callSiteIndex, const Src &src);
 		void CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister);
@@ -152,9 +153,10 @@
 		void RET();
 		void LEAVE();
 
-		int ifDepth;
-		int loopRepDepth;
-		int currentLabel;
+		int ifDepth = 0;
+		int loopRepDepth = 0;
+		int currentLabel = -1;
+		bool scalar = false;
 
 		BasicBlock *ifFalseBlock[24 + 24];
 		BasicBlock *loopRepTestBlock[4];
diff --git a/src/Shader/Shader.hpp b/src/Shader/Shader.hpp
index 9e4a810..ada88ba 100644
--- a/src/Shader/Shader.hpp
+++ b/src/Shader/Shader.hpp
@@ -244,9 +244,10 @@
 			OPCODE_INSERT,
 			OPCODE_DISCARD,
 			OPCODE_FWIDTH,
-			OPCODE_LEAVE,   // Return before the end of the function
+			OPCODE_LEAVE,    // Return before the end of the function
 			OPCODE_CONTINUE,
-			OPCODE_TEST,   // Marks the end of the code that can be skipped by 'continue'
+			OPCODE_TEST,     // Marks the end of the code that can be skipped by 'continue'
+			OPCODE_SCALAR,   // Marks the start of code not subject to SIMD lane masking. Ends at WHILE and ENDWHILE.
 			OPCODE_SWITCH,
 			OPCODE_ENDSWITCH,
 
diff --git a/src/Shader/VertexProgram.cpp b/src/Shader/VertexProgram.cpp
index d492c65..55cc8c8 100644
--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -26,10 +26,6 @@
 	VertexProgram::VertexProgram(const VertexProcessor::State &state, const VertexShader *shader)
 		: VertexRoutine(state, shader), shader(shader), r(shader->indirectAddressableTemporaries)
 	{
-		ifDepth = 0;
-		loopRepDepth = 0;
-		currentLabel = -1;
-
 		for(int i = 0; i < 2048; i++)
 		{
 			labelBlock[i] = 0;
@@ -295,6 +291,7 @@
 			case Shader::OPCODE_BREAKP:     BREAKP(src0);                   break;
 			case Shader::OPCODE_CONTINUE:   CONTINUE();                     break;
 			case Shader::OPCODE_TEST:       TEST();                         break;
+			case Shader::OPCODE_SCALAR:     SCALAR();                       break;
 			case Shader::OPCODE_CALL:       CALL(dst.label, dst.callSite);  break;
 			case Shader::OPCODE_CALLNZ:     CALLNZ(dst.label, dst.callSite, src0); break;
 			case Shader::OPCODE_ELSE:       ELSE();                         break;
@@ -977,6 +974,11 @@
 
 	Int4 VertexProgram::enableMask(const Shader::Instruction *instruction)
 	{
+		if(scalar)
+		{
+			return Int4(0xFFFFFFFF);
+		}
+
 		Int4 enable = instruction->analysisBranch ? Int4(enableStack[enableIndex]) : Int4(0xFFFFFFFF);
 
 		if(shader->containsBreakInstruction() && instruction->analysisBreak)
@@ -1108,6 +1110,11 @@
 		restoreContinue.pop_back();
 	}
 
+	void VertexProgram::SCALAR()
+	{
+		scalar = true;   // While set, enableMask() returns an all-ones mask, i.e. no lane masking.
+	}
+
 	void VertexProgram::CALL(int labelIndex, int callSiteIndex)
 	{
 		if(!labelBlock[labelIndex])
@@ -1284,6 +1291,7 @@
 		Nucleus::setInsertBlock(endBlock);
 
 		enableIndex--;
+		scalar = false;
 	}
 
 	void VertexProgram::ENDSWITCH()
@@ -1490,6 +1498,7 @@
 		Nucleus::setInsertBlock(loopBlock);
 
 		loopRepDepth++;
+		scalar = false;
 	}
 
 	void VertexProgram::SWITCH()
diff --git a/src/Shader/VertexProgram.hpp b/src/Shader/VertexProgram.hpp
index 437c881..33f3c5c 100644
--- a/src/Shader/VertexProgram.hpp
+++ b/src/Shader/VertexProgram.hpp
@@ -86,6 +86,7 @@
 		void BREAK(Int4 &condition);
 		void CONTINUE();
 		void TEST();
+		void SCALAR();
 		void CALL(int labelIndex, int callSiteIndex);
 		void CALLNZ(int labelIndex, int callSiteIndex, const Src &src);
 		void CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister);
@@ -121,9 +122,10 @@
 		Vector4f sampleTexture(const Src &s, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
 		Vector4f sampleTexture(int sampler, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
 
-		int ifDepth;
-		int loopRepDepth;
-		int currentLabel;
+		int ifDepth = 0;
+		int loopRepDepth = 0;
+		int currentLabel = -1;
+		bool scalar = false;
 
 		BasicBlock *ifFalseBlock[24 + 24];
 		BasicBlock *loopRepTestBlock[4];