SpirvShader: Implement OpControlBarrier.

Use the new Reactor coroutines to yield when a subgroup hits an OpControlBarrier.
Yielding pushes the subgroup to the back of the workgroup's invocation queue, which forces every subgroup to reach the barrier before any of them continues.
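
In sketch form, the per-workgroup scheduling works roughly as below. The
Task type and its await() signature are illustrative stand-ins for the
Reactor coroutine stream, not the actual API:

    #include <memory>
    #include <queue>

    enum class YieldResult { ControlBarrier };

    // Illustrative stand-in for rr::Stream<YieldResult>.
    struct Task
    {
        virtual ~Task() = default;
        // Returns true if the subgroup yielded (hit a barrier),
        // false if it ran to completion.
        virtual bool await(YieldResult &result) = 0;
    };

    void runWorkgroup(std::queue<std::unique_ptr<Task>> &subgroups)
    {
        while (!subgroups.empty())
        {
            auto subgroup = std::move(subgroups.front());
            subgroups.pop();

            YieldResult result;
            if (subgroup->await(result))
            {
                // Yielded at OpControlBarrier: requeue it. It is only
                // resumed once every other queued subgroup has either
                // finished or reached the barrier as well.
                subgroups.push(std::move(subgroup));
            }
        }
    }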

Tests: dEQP-VK.spirv_assembly.instruction.compute.workgroup_memory.*
Tests: dEQP-VK.subgroups.basic.compute.*
Tests: dEQP-VK.compute.basic.*

Bug: b/131672705
Bug: b/132232716
Change-Id: Id78be9ce9d9455cb2cb7254482568985845b8b6a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/30851
Presubmit-Ready: Ben Clayton <bclayton@google.com>
Tested-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index 361cbb1..081831c 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -18,6 +18,8 @@
 #include "Vulkan/VkDebug.hpp"
 #include "Vulkan/VkPipelineLayout.hpp"
 
+#include <queue>
+
 namespace
 {
 	enum { X, Y, Z };
@@ -154,17 +156,18 @@
 
 	void ComputeProgram::emit()
 	{
+		Int workgroupX = Arg<1>();
+		Int workgroupY = Arg<2>();
+		Int workgroupZ = Arg<3>();
+		Pointer<Byte> workgroupMemory = Arg<4>();
+		Int firstSubgroup = Arg<5>();
+		Int subgroupCount = Arg<6>();
+
 		routine.descriptorSets = data + OFFSET(Data, descriptorSets);
 		routine.descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
 		routine.pushConstants = data + OFFSET(Data, pushConstants);
 		routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
-		routine.workgroupMemory = *Pointer<Pointer<Byte>>(data + OFFSET(Data, workgroupMemory));
-
-		Int workgroupX = Arg<1>();
-		Int workgroupY = Arg<2>();
-		Int workgroupZ = Arg<3>();
-		Int firstSubgroup = Arg<4>();
-		Int subgroupCount = Arg<5>();
+		routine.workgroupMemory = workgroupMemory;
 
 		Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
 
@@ -210,8 +213,8 @@
 		auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
 
 		// We're sharing a buffer here across all workgroups.
-		// We can only do this because we know workgroups are executed
-		// serially.
+		// We can only do this because we know a single workgroup is in flight
+		// at any time.
 		std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
 
 		Data data;
@@ -230,19 +233,51 @@
 		data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
 		data.pushConstants = pushConstants;
 		data.constants = &sw::constants;
-		data.workgroupMemory = workgroupMemory.data();
 
-		// TODO(bclayton): Split work across threads.
 		for (uint32_t groupZ = 0; groupZ < groupCountZ; groupZ++)
 		{
 			for (uint32_t groupY = 0; groupY < groupCountY; groupY++)
 			{
 				for (uint32_t groupX = 0; groupX < groupCountX; groupX++)
 				{
-					(*this)(&data, groupX, groupY, groupZ, 0, subgroupsPerWorkgroup);
-				}
-			}
-		}
+
+					// TODO(bclayton): Split work across threads.
+					using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
+					std::queue<Coroutine> coroutines;
+
+					if (shader->getModes().ContainsControlBarriers)
+					{
+						// Make a function call per subgroup so each subgroup
+						// can yield, bringing all subgroups to the barrier
+						// together.
+						for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
+						{
+							auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
+							coroutines.push(std::move(coroutine));
+						}
+					}
+					else
+					{
+						auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
+						coroutines.push(std::move(coroutine));
+					}
+
+					while (coroutines.size() > 0)
+					{
+						auto coroutine = std::move(coroutines.front());
+						coroutines.pop();
+
+						SpirvShader::YieldResult result;
+						if (coroutine->await(result))
+						{
+							// TODO: Consider result (when the enum has more than one entry).
+							coroutines.push(std::move(coroutine));
+						}
+					}
+
+				} // groupX
+			} // groupY
+		} // groupZ
 	}
 
 } // namespace sw
diff --git a/src/Pipeline/ComputeProgram.hpp b/src/Pipeline/ComputeProgram.hpp
index b2a3785..762a44b 100644
--- a/src/Pipeline/ComputeProgram.hpp
+++ b/src/Pipeline/ComputeProgram.hpp
@@ -37,11 +37,12 @@
 	struct Constants;
 
 	// ComputeProgram builds a SPIR-V compute shader.
-	class ComputeProgram : public Coroutine<int(
+	class ComputeProgram : public Coroutine<SpirvShader::YieldResult(
 			void* data,
 			int32_t workgroupX,
 			int32_t workgroupY,
 			int32_t workgroupZ,
+			void* workgroupMemory,
 			int32_t firstSubgroup,
 			int32_t subgroupCount)>
 	{
@@ -80,7 +81,6 @@
 			uint32_t invocationsPerWorkgroup; // Total number of invocations per workgroup.
 			PushConstantStorage pushConstants;
 			const Constants *constants;
-			uint8_t* workgroupMemory;
 		};
 
 		SpirvRoutine routine;
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 020309e..ae720a2 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -13,8 +13,9 @@
 // limitations under the License.
 
 #include "SpirvShader.hpp"
-
 #include "SamplerCore.hpp"
+
+#include "Reactor/Coroutine.hpp"
 #include "System/Math.hpp"
 #include "Vulkan/VkBuffer.hpp"
 #include "Vulkan/VkBufferView.hpp"
@@ -884,6 +885,10 @@
 				// Don't need to do anything during analysis pass
 				break;
 
+			case spv::OpControlBarrier:
+				modes.ContainsControlBarriers = true;
+				break;
+
 			case spv::OpExtension:
 			{
 				auto ext = reinterpret_cast<char const *>(insn.wordPointer(1));
@@ -2462,6 +2467,9 @@
 		case spv::OpCopyMemory:
 			return EmitCopyMemory(insn, state);
 
+		case spv::OpControlBarrier:
+			return EmitControlBarrier(insn, state);
+
 		case spv::OpMemoryBarrier:
 			return EmitMemoryBarrier(insn, state);
 
@@ -4889,6 +4897,11 @@
 		return ptr;
 	}
 
+	void SpirvShader::Yield(YieldResult res) const
+	{
+		rr::Yield(RValue<Int>(int(res)));
+	}
+
 	SpirvShader::EmitResult SpirvShader::EmitImageRead(InsnIterator insn, EmitState *state) const
 	{
 		auto &resultType = getType(Type::ID(insn.word(1)));
@@ -5468,6 +5481,29 @@
 		return EmitResult::Continue;
 	}
 
+	SpirvShader::EmitResult SpirvShader::EmitControlBarrier(InsnIterator insn, EmitState *state) const
+	{
+		auto executionScope = spv::Scope(GetConstScalarInt(insn.word(1)));
+		auto semantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(3)));
+		// TODO: We probably want to consider the memory scope here. For now,
+		// just always emit the full fence.
+		Fence(semantics);
+
+		switch (executionScope)
+		{
+		case spv::ScopeWorkgroup:
+		case spv::ScopeSubgroup:
+			Yield(YieldResult::ControlBarrier);
+			break;
+		default:
+			// See Vulkan 1.1 spec, Appendix A, Validation Rules within a Module.
+			UNREACHABLE("Scope for execution must be limited to Workgroup or Subgroup");
+			break;
+		}
+
+		return EmitResult::Continue;
+	}
+
 	SpirvShader::EmitResult SpirvShader::EmitMemoryBarrier(InsnIterator insn, EmitState *state) const
 	{
 		auto semantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(2)));
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index cb37cce..1eaa6f8 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -262,6 +262,11 @@
 		using ImageSampler = void(void* texture, void *sampler, void* uvsIn, void* texelOut, void* constants);
 		using GetImageSampler = ImageSampler*(const vk::ImageView *imageView, const vk::Sampler *sampler);
 
+		enum class YieldResult
+		{
+			ControlBarrier,
+		};
+
 		/* Pseudo-iterator over SPIRV instructions, designed to support range-based-for. */
 		class InsnIterator
 		{
@@ -543,6 +548,7 @@
 			bool DepthLess : 1;
 			bool DepthUnchanged : 1;
 			bool ContainsKill : 1;
+			bool ContainsControlBarriers : 1;
 			bool NeedsCentroid : 1;
 
 			// Compute workgroup dimensions
@@ -934,6 +940,7 @@
 		EmitResult EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const;
 		EmitResult EmitSampledImageCombineOrSplit(InsnIterator insn, EmitState *state) const;
 		EmitResult EmitCopyMemory(InsnIterator insn, EmitState *state) const;
+		EmitResult EmitControlBarrier(InsnIterator insn, EmitState *state) const;
 		EmitResult EmitMemoryBarrier(InsnIterator insn, EmitState *state) const;
 		EmitResult EmitGroupNonUniform(InsnIterator insn, EmitState *state) const;
 
@@ -944,6 +951,9 @@
 		// Emits a rr::Fence for the given MemorySemanticsMask.
 		void Fence(spv::MemorySemanticsMask semantics) const;
 
+		// Helper for calling rr::Yield with res cast to an rr::Int.
+		void Yield(YieldResult res) const;
+
 		// OpcodeName() returns the name of the opcode op.
 		// If NDEBUG is defined, then OpcodeName() will only return the numerical code.
 		static std::string OpcodeName(spv::Op op);