Pipeline: Use Yarn to make compute multi-threaded.

Split the compute dispatch across Yarn worker threads: the workgroups
are divided into up to 16 batches, and each batch is scheduled as a
task that strides through the linear workgroup index space. Each task
allocates its own workgroup memory, since multiple workgroups can now
be in flight at once, and a yarn::WaitGroup blocks the dispatching
thread until all batches have completed.

Bug: b/139142453
Change-Id: I466b7c935db03104cb4df90735fafe10905bef9e
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35568
Tested-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
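
For context, below is a minimal standalone sketch of the fan-out/join
pattern this change introduces. It assumes a Yarn scheduler is already
bound to the calling thread (as SwiftShader arranges elsewhere);
runBatched, count, and the per-item loop body are illustrative names
rather than part of this change, and the Yarn/Scheduler.hpp include
for yarn::schedule is likewise an assumption.

	#include "Yarn/Defer.hpp"
	#include "Yarn/Scheduler.hpp"
	#include "Yarn/WaitGroup.hpp"

	#include <cstdint>

	// Process `count` independent items using up to `batchCount` tasks.
	// Task i handles items i, i + batchCount, i + 2 * batchCount, ...
	void runBatched(uint32_t count, uint32_t batchCount)
	{
		yarn::WaitGroup wg;
		for (uint32_t batchID = 0; batchID < batchCount && batchID < count; batchID++)
		{
			wg.add(1);
			yarn::schedule([=]
			{
				defer(wg.done());  // Mark this batch done even on early return.
				for (uint32_t i = batchID; i < count; i += batchCount)
				{
					// ... process item i ...
				}
			});
		}
		wg.wait();  // Block until every scheduled batch has finished.
	}

Note that wg is captured by value: WaitGroup copies share the same
underlying counter, which is what lets wg.done() inside the task and
wg.wait() on the dispatching thread synchronize, and is what the
change below relies on.
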
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index b592f0c..a209339 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -18,6 +18,10 @@
 #include "Vulkan/VkDebug.hpp"
 #include "Vulkan/VkPipelineLayout.hpp"
 
+#include "Yarn/Defer.hpp"
+#include "Yarn/Trace.hpp"
+#include "Yarn/WaitGroup.hpp"
+
 #include <queue>
 
 namespace
@@ -40,6 +44,8 @@
 
 	void ComputeProgram::generate()
 	{
+		YARN_SCOPED_EVENT("ComputeProgram::generate");
+
 		SpirvRoutine routine(pipelineLayout);
 		shader->emitProlog(&routine);
 		emit(&routine);
@@ -199,11 +205,6 @@
 		auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
 		auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
 
-		// We're sharing a buffer here across all workgroups.
-		// We can only do this because we know a single workgroup is in flight
-		// at any time.
-		std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
-
 		Data data;
 		data.descriptorSets = descriptorSets;
 		data.descriptorDynamicOffsets = descriptorDynamicOffsets;
@@ -221,14 +222,38 @@
 		data.pushConstants = pushConstants;
 		data.constants = &sw::constants;
 
-		for (uint32_t groupZ = baseGroupZ; groupZ < baseGroupZ + groupCountZ; groupZ++)
-		{
-			for (uint32_t groupY = baseGroupY; groupY < baseGroupY + groupCountY; groupY++)
-			{
-				for (uint32_t groupX = baseGroupX; groupX < baseGroupX + groupCountX; groupX++)
-				{
+		yarn::WaitGroup wg;
+		const uint32_t batchCount = 16;  // Maximum number of tasks to split the dispatch into.
 
-					// TODO(bclayton): Split work across threads.
+		auto groupCount = groupCountX * groupCountY * groupCountZ;
+
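+		// Schedule up to batchCount tasks; task batchID strides through the
+		// linear workgroup indices, spreading the dispatch across threads.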
+		for (uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
+		{
+			wg.add(1);
+			yarn::schedule([=, &data]
+			{
+				defer(wg.done());
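+				// Give each task its own workgroup memory: several workgroups
+				// can now be in flight at the same time.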
+				std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
+
+				for (uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
+				{
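+					// Decompose the linear group index into X, Y and Z offsets.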
+					auto modulo = groupIndex;
+					auto groupOffsetZ = modulo / (groupCountX * groupCountY);
+					modulo -= groupOffsetZ * (groupCountX * groupCountY);
+					auto groupOffsetY = modulo / groupCountX;
+					modulo -= groupOffsetY * groupCountX;
+					auto groupOffsetX = modulo;
+
+					auto groupZ = baseGroupZ + groupOffsetZ;
+					auto groupY = baseGroupY + groupOffsetY;
+					auto groupX = baseGroupX + groupOffsetX;
+					YARN_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
+
 					using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
 					std::queue<Coroutine> coroutines;
 
@@ -261,10 +281,11 @@
 							coroutines.push(std::move(coroutine));
 						}
 					}
+				}
+			});
+		}
 
-				} // groupX
-			} // groupY
-		} // groupZ
+		wg.wait();  // Block until all scheduled batches have completed.
 	}
 
 } // namespace sw