LLVMReactor: Drop codegenMutex, now thread safe.

LLVMReactor used to have a Big Fat Global Mutex over the entire
lifetime of the Nucleus object. This was required as LLVMReactor
used global variables for storing builder state.

Over the past year, there has been significant code cleanup and
global state has been reduced to a couple of globals that can now
be marked thread_local. With all remaining state now being immutable,
thread-local, or independently synchronized (an atomic counter and
the lock-guarded default config), we are able to remove the mutex.

ASAN and TSAN checks for our unittests are clean.

Bug: b/153803432
Change-Id: Ibe4019fb783f86e734387db431539e915369b488
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/33484
Tested-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Reactor/Coroutine.hpp b/src/Reactor/Coroutine.hpp
index d70ecf0..cd8763c 100644
--- a/src/Reactor/Coroutine.hpp
+++ b/src/Reactor/Coroutine.hpp
@@ -135,12 +135,10 @@
 	// executable code. After calling, no more reactor functions may be
 	// called without building a new rr::Function or rr::Coroutine.
 	// While automatically called by operator(), finalize() should be called
-	// as early as possible to release the global Reactor mutex lock.
-	// It must also be called explicitly on the same thread that instantiates
-	// the Coroutine instance if operator() is invoked on separate threads.
-	// This is because presently, Reactor backends use a global mutex scoped
-	// to the generation of routines, and these must be locked/unlocked on the
-	// same thread.
+	// as soon as possible once the coroutine has been fully built.
+	// finalize() *must* be called explicitly on the same thread that
+	// instantiated the Coroutine if operator() is to be invoked on other
+	// threads, because Reactor's code-building state is thread-local.
 	inline void finalize(const Config::Edit &cfg = Config::Edit::None);
 
 	// Starts execution of the coroutine and returns a unique_ptr to a
diff --git a/src/Reactor/LLVMJIT.cpp b/src/Reactor/LLVMJIT.cpp
index bbd91e3..68153a7 100644
--- a/src/Reactor/LLVMJIT.cpp
+++ b/src/Reactor/LLVMJIT.cpp
@@ -57,6 +57,7 @@
     __pragma(warning(pop))
 #endif
 
+#include <atomic>
 #include <unordered_map>
 
 #if defined(_WIN64)
@@ -610,7 +611,7 @@
 		for(size_t i = 0; i < count; i++)
 		{
 			auto func = funcs[i];
-			static size_t numEmittedFunctions = 0;
+			static std::atomic<size_t> numEmittedFunctions = { 0 };
 			std::string name = "f" + llvm::Twine(numEmittedFunctions++).str();
 			func->setName(name);
 			func->setLinkage(llvm::GlobalValue::ExternalLinkage);
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 6cce3f2..48861e9 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -59,10 +59,13 @@
 }
 #endif
 
+#if !LLVM_ENABLE_THREADS
+#	error "LLVM_ENABLE_THREADS needs to be enabled"
+#endif
+
 namespace {
 
-std::unique_ptr<rr::JITBuilder> jit;
-std::mutex codegenMutex;
+thread_local std::unique_ptr<rr::JITBuilder> jit;
 
 // Default configuration settings. Must be accessed under mutex lock.
 std::mutex defaultConfigLock;
@@ -599,8 +602,6 @@
 
 Nucleus::Nucleus()
 {
-	::codegenMutex.lock();  // Reactor and LLVM are currently not thread safe
-
 	ASSERT(jit == nullptr);
 	jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));
 }
@@ -608,7 +609,6 @@
 Nucleus::~Nucleus()
 {
 	jit.reset();
-	::codegenMutex.unlock();
 }
 
 void Nucleus::setDefaultConfig(const Config &cfg)
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index dff1a29..2bf5324 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -63,7 +63,7 @@
 }
 
 // Set of variables that do not have a stack location yet.
-std::unordered_set<Variable *> Variable::unmaterializedVariables;
+thread_local std::unordered_set<Variable *> Variable::unmaterializedVariables;
 
 Variable::Variable(Type *type, int arraySize)
     : arraySize(arraySize)
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 488c0be..33e8b44 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -133,7 +133,7 @@
 	static void materializeAll();
 	static void killUnmaterialized();
 
-	static std::unordered_set<Variable *> unmaterializedVariables;
+	static thread_local std::unordered_set<Variable *> unmaterializedVariables;
 
 	Type *const type;
 	mutable Value *rvalue = nullptr;
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index d5aafa9..adb0cb4 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -2949,6 +2949,114 @@
 	EXPECT_EQ(result, value);
 }
 
+TEST(ReactorUnitTests, Multithreaded_Function)
+{
+	constexpr int numThreads = 32;
+	constexpr int numLoops = 64;
+
+	auto threads = std::unique_ptr<std::thread[]>(new std::thread[numThreads]);
+	auto results = std::unique_ptr<int[]>(new int[numThreads * numLoops]);
+
+	for(int t = 0; t < numThreads; t++)
+	{
+		auto threadFunc = [&](int t) {
+			for(int l = 0; l < numLoops; l++)
+			{
+				FunctionT<int(int, int)> function;
+				{
+					Int a = function.Arg<0>();
+					Int b = function.Arg<1>();
+					Return((a << 16) | b);
+				}
+
+				auto f = function("thread%d_loop%d", t, l);
+				results[t * numLoops + l] = f(t, l);
+			}
+		};
+		threads[t] = std::thread(threadFunc, t);
+	}
+
+	for(int t = 0; t < numThreads; t++)
+	{
+		threads[t].join();
+	}
+
+	for(int t = 0; t < numThreads; t++)
+	{
+		for(int l = 0; l < numLoops; l++)
+		{
+			auto expect = (t << 16) | l;
+			auto result = results[t * numLoops + l];
+			EXPECT_EQ(result, expect);
+		}
+	}
+}
+
+TEST(ReactorUnitTests, Multithreaded_Coroutine)
+{
+	if(!rr::Caps.CoroutinesSupported)
+	{
+		SUCCEED() << "Coroutines not supported";
+		return;
+	}
+
+	constexpr int numThreads = 32;
+	constexpr int numLoops = 64;
+
+	struct Result
+	{
+		bool yieldReturns[3];
+		int yieldValues[3];
+	};
+
+	auto threads = std::unique_ptr<std::thread[]>(new std::thread[numThreads]);
+	auto results = std::unique_ptr<Result[]>(new Result[numThreads * numLoops]);
+
+	for(int t = 0; t < numThreads; t++)
+	{
+		auto threadFunc = [&](int t) {
+			for(int l = 0; l < numLoops; l++)
+			{
+				Coroutine<int(int, int)> function;
+				{
+					Int a = function.Arg<0>();
+					Int b = function.Arg<1>();
+					Yield(a);
+					Yield(b);
+				}
+
+				auto coroutine = function(t, l);
+
+				auto &result = results[t * numLoops + l];
+				result = {};
+				result.yieldReturns[0] = coroutine->await(result.yieldValues[0]);
+				result.yieldReturns[1] = coroutine->await(result.yieldValues[1]);
+				result.yieldReturns[2] = coroutine->await(result.yieldValues[2]);
+			}
+		};
+		threads[t] = std::thread(threadFunc, t);
+	}
+
+	for(int t = 0; t < numThreads; t++)
+	{
+		threads[t].join();
+	}
+
+	for(int t = 0; t < numThreads; t++)
+	{
+		for(int l = 0; l < numLoops; l++)
+		{
+			auto const &result = results[t * numLoops + l];
+			EXPECT_EQ(result.yieldReturns[0], true);
+			EXPECT_EQ(result.yieldValues[0], t);
+			EXPECT_EQ(result.yieldReturns[1], true);
+			EXPECT_EQ(result.yieldValues[1], l);
+			EXPECT_EQ(result.yieldReturns[2], false);
+			EXPECT_EQ(result.yieldValues[2], 0);
+		}
+	}
+}
+
 int main(int argc, char **argv)
 {
 	::testing::InitGoogleTest(&argc, argv);