Subzero Coroutines: Use ultra-low-level marl APIs

Instead of using the `marl::Event` synchronization primitives, use the lower-level `marl::Scheduler::Fiber` APIs directly, including the new `marl::Scheduler::Fiber::wait()` function that does not take a lock.

This provides roughly another 2x performance improvement on the Coroutines/Fibonacci benchmarks:

```
go run ./third_party/marl/tools/cmd/benchdiff/main.go pre.json post.json
Delta                | Test name                                | (A) pre.json | (B) post.json
-2.08x -81.186µs     | Coroutines/Fibonacci/iterations:512      | 156.202µs    | 75.016µs
-2.10x -668.201µs    | Coroutines/Fibonacci/iterations:4096     | 1.275069ms   | 606.868µs
-2.11x -5.359326ms   | Coroutines/Fibonacci/iterations:32768    | 10.166126ms  | 4.8068ms
-2.12x -10.342µs     | Coroutines/Fibonacci/iterations:64       | 19.585µs     | 9.243µs
-2.13x -2.784542745s | Coroutines/Fibonacci/iterations:16777216 | 5.251299045s | 2.4667563s
-2.13x -347.220746ms | Coroutines/Fibonacci/iterations:2097152  | 653.812928ms | 306.592182ms
-2.14x -43.615678ms  | Coroutines/Fibonacci/iterations:262144   | 82.017312ms  | 38.401634ms
```

Bug: b/145754674
Change-Id: I5f6b0c8c92af645cc2a825c6f1e2769b2440638e
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/42850
Tested-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 3221a7e..522b566 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -4481,10 +4481,11 @@
 struct CoroutineData
 {
 	bool useInternalScheduler = false;
-	marl::Event suspended;                                // the coroutine is suspended on a yield()
-	marl::Event resumed;                                  // the caller is suspended on an await()
-	marl::Event done{ marl::Event::Mode::Manual };        // the coroutine should stop at the next yield()
-	marl::Event terminated{ marl::Event::Mode::Manual };  // the coroutine has finished.
+	bool done = false;        // the coroutine should stop at the next yield()
+	bool terminated = false;  // the coroutine has finished.
+	bool inRoutine = false;   // is the coroutine currently executing?
+	marl::Scheduler::Fiber *mainFiber = nullptr;
+	marl::Scheduler::Fiber *routineFiber = nullptr;
 	void *promisePtr = nullptr;
 };
 
@@ -4504,19 +4505,31 @@
 // is called.
 bool suspend(Nucleus::CoroutineHandle handle)
 {
-	auto *data = reinterpret_cast<CoroutineData *>(handle);
-	data->suspended.signal();
-	data->resumed.wait();
-	return !data->done.test();
+	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
+	ASSERT(marl::Scheduler::Fiber::current() == coroData->routineFiber);
+	ASSERT(coroData->inRoutine);
+	coroData->inRoutine = false;
+	coroData->mainFiber->notify();
+	while(!coroData->inRoutine)
+	{
+		coroData->routineFiber->wait();
+	}
+	return !coroData->done;
 }
 
 // resume() is called by await(), blocking until the coroutine calls yield()
 // or the coroutine terminates.
 void resume(Nucleus::CoroutineHandle handle)
 {
-	auto *data = reinterpret_cast<CoroutineData *>(handle);
-	data->resumed.signal();
-	data->suspended.wait();
+	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
+	ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
+	ASSERT(!coroData->inRoutine);
+	coroData->inRoutine = true;
+	coroData->routineFiber->notify();
+	while(coroData->inRoutine)
+	{
+		coroData->mainFiber->wait();
+	}
 }
 
 // stop() is called by coroutine_destroy(), signalling that it's done, then blocks
@@ -4524,9 +4537,18 @@
 void stop(Nucleus::CoroutineHandle handle)
 {
 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
-	coroData->done.signal();      // signal that the coroutine should stop at next (or current) yield.
-	coroData->resumed.signal();   // wake the coroutine if blocked on a yield.
-	coroData->terminated.wait();  // wait for the coroutine to return.
+	ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
+	ASSERT(!coroData->inRoutine);
+	if(!coroData->terminated)
+	{
+		coroData->done = true;
+		coroData->inRoutine = true;
+		coroData->routineFiber->notify();
+		while(!coroData->terminated)
+		{
+			coroData->mainFiber->wait();
+		}
+	}
 	if(coroData->useInternalScheduler)
 	{
 		::getOrCreateScheduler().unbind();
@@ -4555,7 +4577,7 @@
 bool isDone(Nucleus::CoroutineHandle handle)
 {
 	auto *coroData = reinterpret_cast<CoroutineData *>(handle);
-	return coroData->done.test();
+	return coroData->done;
 }
 
 void setPromisePtr(Nucleus::CoroutineHandle handle, void *promisePtr)
@@ -4765,15 +4787,29 @@
 		// any fiber switch occurs.
 		coro::setHandleParam(coroData);
 
+		ASSERT(!coroData->routineFiber);
+		coroData->routineFiber = marl::Scheduler::Fiber::current();
+
 		beginFunc();
 
-		coroData->done.signal();        // coroutine is done.
-		coroData->suspended.signal();   // resume any blocking await() call.
-		coroData->terminated.signal();  // signal that the coroutine data is ready for freeing.
+		ASSERT(coroData->inRoutine);
+		coroData->done = true;        // coroutine is done.
+		coroData->terminated = true;  // signal that the coroutine data is ready for freeing.
+		coroData->inRoutine = false;
+		coroData->mainFiber->notify();
 	};
-	marl::schedule(marl::Task(run, marl::Task::Flags::SameThread));
 
-	coroData->suspended.wait();  // block until the first yield or coroutine end
+	ASSERT(!coroData->mainFiber);
+	coroData->mainFiber = marl::Scheduler::Fiber::current();
+
+	// block until the first yield or coroutine end
+	ASSERT(!coroData->inRoutine);
+	coroData->inRoutine = true;
+	marl::schedule(marl::Task(run, marl::Task::Flags::SameThread));
+	while(coroData->inRoutine)
+	{
+		coroData->mainFiber->wait();
+	}
 
 	return coroData;
 }