Use atomic operations to specify shared memory access order

TSAN detected many data race errors in the SwiftShader Renderer
class. x86 has a strong memory ordering model, which guarantees that
changes are observed in the same order by other threads. C++, however,
provides no such guarantee unless the accesses are specified as atomic
operations. To fix these races, a new AtomicInt class was added, which
is basically a wrapper around std::atomic<int> that exposes only the
portion of the API required by SwiftShader.
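
Because AtomicInt overloads the usual integer operators, call sites
need little or no change when a volatile int member is replaced. A
minimal usage sketch (the counter variable and worker() function are
illustrative, not part of this change):

    #include <atomic>

    // Trimmed-down AtomicInt: acquire load on read, release store on
    // write, acq_rel ordering for read-modify-write operations.
    class AtomicInt
    {
    public:
        AtomicInt() : ai() {}
        AtomicInt(int i) : ai(i) {}

        operator int() const { return ai.load(std::memory_order_acquire); }
        void operator=(int i) { ai.store(i, std::memory_order_release); }
        void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }

    private:
        std::atomic<int> ai;
    };

    AtomicInt counter;   // previously: volatile int counter;

    void worker()
    {
        ++counter;               // atomic read-modify-write, no data race
        int snapshot = counter;  // one atomic acquire load
        if(snapshot > 0) { /* ... */ }
    }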

Since std::atomic isn't available on older versions of Android, a
fallback class, closer to the previous implementation, was written
without std::atomic. Running a few dEQP tests shows both classes
working properly, and both perform similarly.
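
The fallback builds on the sw::atomicIncrement()/atomicExchange()
helpers already present in Thread.hpp, which map onto platform
intrinsics. A simplified sketch of one such helper (the real header
covers more operations):

    #if defined(_WIN32)
        #include <windows.h>

        inline int atomicIncrement(volatile int *value)
        {
            // InterlockedIncrement returns the incremented value.
            return InterlockedIncrement((volatile long*)value);
        }
    #else
        inline int atomicIncrement(volatile int *value)
        {
            // __sync_add_and_fetch returns the new value.
            return __sync_add_and_fetch(value, 1);
        }
    #endif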

A few minor changes were also made to reduce the number of atomic
integer operations where possible.
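
The main such pattern, visible in the Clipper.cpp hunk below, is to
read an atomic once into a plain local and reuse the local rather than
reloading the atomic. A sketch of the before/after:

    // Before: every test performs its own atomic load of draw.clipFlags.
    // if(draw.clipFlags & CLIP_PLANE0) clipPlane(polygon, data.clipPlane[0]);
    // if(draw.clipFlags & CLIP_PLANE1) clipPlane(polygon, data.clipPlane[1]);

    // After: one atomic load via AtomicInt::operator int(), then cheap
    // reads of an ordinary local.
    int clipFlags = draw.clipFlags;
    if(clipFlags & CLIP_PLANE0) clipPlane(polygon, data.clipPlane[0]);
    if(clipFlags & CLIP_PLANE1) clipPlane(polygon, data.clipPlane[1]);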

Change-Id: Ife6d3a2b6113346f8f8163b692e79c2a0e03b22f
Reviewed-on: https://swiftshader-review.googlesource.com/12308
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Common/Thread.hpp b/src/Common/Thread.hpp
index 87b90d1..7b4e0ca 100644
--- a/src/Common/Thread.hpp
+++ b/src/Common/Thread.hpp
@@ -30,6 +30,17 @@
 
 #include <stdlib.h>
 
+#if defined(__clang__)
+#define USE_STD_ATOMIC __has_include(<atomic>) // clang has an explicit check for the availability of atomic
+// atomic is available in C++11 or newer, and in Visual Studio 2012 or newer
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1700)) || (__cplusplus >= 201103L)
+#define USE_STD_ATOMIC 1
+#endif
+
+#if USE_STD_ATOMIC
+#include <atomic>
+#endif
+
 namespace sw
 {
 	class Event;
@@ -265,7 +276,7 @@
 
 	inline int atomicAdd(volatile int* target, int value)
 	{
-		#if defined(_MSC_VER)
+		#if defined(_WIN32)
 			return InterlockedExchangeAdd((volatile long*)target, value) + value;
 		#else
 			return __sync_add_and_fetch(target, value);
@@ -280,6 +291,46 @@
 			__asm__ __volatile__ ("nop");
 		#endif
 	}
+
+	#if USE_STD_ATOMIC
+		class AtomicInt
+		{
+		public:
+			AtomicInt() : ai() {}
+			AtomicInt(int i) : ai(i) {}
+
+			inline operator int() const { return ai.load(std::memory_order_acquire); }
+			inline void operator=(const AtomicInt& i) { ai.store(i.ai.load(std::memory_order_acquire), std::memory_order_release); }
+			inline void operator=(int i) { ai.store(i, std::memory_order_release); }
+			inline void operator--() { ai.fetch_sub(1, std::memory_order_acq_rel); }
+			inline void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }
+			inline int operator--(int) { return ai.fetch_sub(1, std::memory_order_acq_rel) - 1; }   // Note: returns the new value, like atomicDecrement()
+			inline int operator++(int) { return ai.fetch_add(1, std::memory_order_acq_rel) + 1; }   // Note: returns the new value, like atomicIncrement()
+			inline void operator-=(int i) { ai.fetch_sub(i, std::memory_order_acq_rel); }
+			inline void operator+=(int i) { ai.fetch_add(i, std::memory_order_acq_rel); }
+		private:
+			std::atomic<int> ai;
+		};
+	#else
+		class AtomicInt
+		{
+		public:
+			AtomicInt() {}
+			AtomicInt(int i) : vi(i) {}
+
+			inline operator int() const { return vi; } // Note: this isn't a guaranteed atomic operation
+			inline void operator=(const AtomicInt& i) { sw::atomicExchange(&vi, i.vi); }
+			inline void operator=(int i) { sw::atomicExchange(&vi, i); }
+			inline void operator--() { sw::atomicDecrement(&vi); }
+			inline void operator++() { sw::atomicIncrement(&vi); }
+			inline int operator--(int) { return sw::atomicDecrement(&vi); }
+			inline int operator++(int) { return sw::atomicIncrement(&vi); }
+			inline void operator-=(int i) { sw::atomicAdd(&vi, -i); }
+			inline void operator+=(int i) { sw::atomicAdd(&vi, i); }
+		private:
+			volatile int vi;
+		};
+	#endif
 }
 
 #endif   // sw_Thread_hpp
diff --git a/src/Renderer/Clipper.cpp b/src/Renderer/Clipper.cpp
index cfd859f..cf3fe3c 100644
--- a/src/Renderer/Clipper.cpp
+++ b/src/Renderer/Clipper.cpp
@@ -60,20 +60,21 @@
 
 		if(clipFlagsOr & CLIP_USER)
 		{
+			int clipFlags = draw.clipFlags;
 			DrawData &data = *draw.data;
 
 			if(polygon.n >= 3) {
-			if(draw.clipFlags & CLIP_PLANE0) clipPlane(polygon, data.clipPlane[0]);
+			if(clipFlags & CLIP_PLANE0) clipPlane(polygon, data.clipPlane[0]);
 			if(polygon.n >= 3) {
-			if(draw.clipFlags & CLIP_PLANE1) clipPlane(polygon, data.clipPlane[1]);
+			if(clipFlags & CLIP_PLANE1) clipPlane(polygon, data.clipPlane[1]);
 			if(polygon.n >= 3) {
-			if(draw.clipFlags & CLIP_PLANE2) clipPlane(polygon, data.clipPlane[2]);
+			if(clipFlags & CLIP_PLANE2) clipPlane(polygon, data.clipPlane[2]);
 			if(polygon.n >= 3) {
-			if(draw.clipFlags & CLIP_PLANE3) clipPlane(polygon, data.clipPlane[3]);
+			if(clipFlags & CLIP_PLANE3) clipPlane(polygon, data.clipPlane[3]);
 			if(polygon.n >= 3) {
-			if(draw.clipFlags & CLIP_PLANE4) clipPlane(polygon, data.clipPlane[4]);
+			if(clipFlags & CLIP_PLANE4) clipPlane(polygon, data.clipPlane[4]);
 			if(polygon.n >= 3) {
-			if(draw.clipFlags & CLIP_PLANE5) clipPlane(polygon, data.clipPlane[5]);
+			if(clipFlags & CLIP_PLANE5) clipPlane(polygon, data.clipPlane[5]);
 			}}}}}}
 		}
 
diff --git a/src/Renderer/Renderer.cpp b/src/Renderer/Renderer.cpp
index 0869697..e918c43 100644
--- a/src/Renderer/Renderer.cpp
+++ b/src/Renderer/Renderer.cpp
@@ -314,7 +314,7 @@
 					Query* q = *query;
 					if(includePrimitivesWrittenQueries || (q->type != Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN))
 					{
-						atomicIncrement(&(q->reference));
+						++q->reference; // Atomic
 						draw->queries->push_back(q);
 					}
 				}
@@ -645,7 +645,7 @@
 			draw->references = (count + batch - 1) / batch;
 
 			schedulerMutex.lock();
-			nextDraw++;
+			++nextDraw; // Atomic
 			schedulerMutex.unlock();
 
 			#ifndef NDEBUG
@@ -771,9 +771,12 @@
 		{
 			DrawCall *draw = drawList[currentDraw % DRAW_COUNT];
 
-			if(draw->primitive >= draw->count)
+			int primitive = draw->primitive;
+			int count = draw->count;
+
+			if(primitive >= count)
 			{
-				currentDraw++;
+				++currentDraw; // Atomic
 
 				if(currentDraw == nextDraw)
 				{
@@ -785,8 +788,8 @@
 
 			if(!primitiveProgress[unit].references)   // Task not already being executed and not still in use by a pixel unit
 			{
-				int primitive = draw->primitive;
-				int count = draw->count;
+				primitive = draw->primitive;
+				count = draw->count;
 				int batch = draw->batchSize;
 
 				primitiveProgress[unit].drawCall = currentDraw;
@@ -812,7 +815,9 @@
 	{
 		schedulerMutex.lock();
 
-		if((int)qSize < threadCount - threadsAwake + 1)
+		int curThreadsAwake = threadsAwake;
+
+		if((int)qSize < threadCount - curThreadsAwake + 1)
 		{
 			findAvailableTasks();
 		}
@@ -822,9 +827,9 @@
 			task[threadIndex] = taskQueue[(qHead - qSize) % 32];
 			qSize--;
 
-			if(threadsAwake != threadCount)
+			if(curThreadsAwake != threadCount)
 			{
-				int wakeup = qSize - threadsAwake + 1;
+				int wakeup = qSize - curThreadsAwake + 1;
 
 				for(int i = 0; i < threadCount && wakeup > 0; i++)
 				{
@@ -834,7 +839,7 @@
 						task[i].type = Task::RESUME;
 						resume[i]->signal();
 
-						threadsAwake++;
+						++threadsAwake; // Atomic
 						wakeup--;
 					}
 				}
@@ -844,7 +849,7 @@
 		{
 			task[threadIndex].type = Task::SUSPEND;
 
-			threadsAwake--;
+			--threadsAwake; // Atomic
 		}
 
 		schedulerMutex.unlock();
@@ -943,15 +948,15 @@
 
 		if(pixelProgress[cluster].processedPrimitives >= draw.count)
 		{
-			pixelProgress[cluster].drawCall++;
+			++pixelProgress[cluster].drawCall; // Atomic
 			pixelProgress[cluster].processedPrimitives = 0;
 		}
 
-		int ref = atomicDecrement(&primitiveProgress[unit].references);
+		int ref = primitiveProgress[unit].references--; // Atomic
 
 		if(ref == 0)
 		{
-			ref = atomicDecrement(&draw.references);
+			ref = draw.references--; // Atomic
 
 			if(ref == 0)
 			{
@@ -976,17 +981,17 @@
 						case Query::FRAGMENTS_PASSED:
 							for(int cluster = 0; cluster < clusterCount; cluster++)
 							{
-								atomicAdd((volatile int*)&query->data, data.occlusion[cluster]);
+								query->data += data.occlusion[cluster];
 							}
 							break;
 						case Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-							atomicAdd((volatile int*)&query->data, processedPrimitives);
+							query->data += processedPrimitives;
 							break;
 						default:
 							break;
 						}
 
-						atomicDecrement(&query->reference);
+						--query->reference; // Atomic
 					}
 
 					delete draw.queries;
@@ -1069,17 +1074,18 @@
 	void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread)
 	{
 		Triangle *triangle = triangleBatch[unit];
-		DrawCall *draw = drawList[primitiveProgress[unit].drawCall % DRAW_COUNT];
+		int primitiveDrawCall = primitiveProgress[unit].drawCall;
+		DrawCall *draw = drawList[primitiveDrawCall % DRAW_COUNT];
 		DrawData *data = draw->data;
 		VertexTask *task = vertexTask[thread];
 
 		const void *indices = data->indices;
 		VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;
 
-		if(task->vertexCache.drawCall != primitiveProgress[unit].drawCall)
+		if(task->vertexCache.drawCall != primitiveDrawCall)
 		{
 			task->vertexCache.clear();
-			task->vertexCache.drawCall = primitiveProgress[unit].drawCall;
+			task->vertexCache.drawCall = primitiveDrawCall;
 		}
 
 		unsigned int batch[128][3];   // FIXME: Adjust to dynamic batch size
diff --git a/src/Renderer/Renderer.hpp b/src/Renderer/Renderer.hpp
index e33f828..4576d23 100644
--- a/src/Renderer/Renderer.hpp
+++ b/src/Renderer/Renderer.hpp
@@ -110,8 +110,8 @@
 		}
 
 		bool building;
-		volatile int reference;
-		volatile unsigned int data;
+		AtomicInt reference;
+		AtomicInt data;
 
 		const Type type;
 	};
@@ -213,8 +213,8 @@
 
 		~DrawCall();
 
-		DrawType drawType;
-		int batchSize;
+		AtomicInt drawType;
+		AtomicInt batchSize;
 
 		Routine *vertexRoutine;
 		Routine *setupRoutine;
@@ -247,11 +247,11 @@
 
 		std::list<Query*> *queries;
 
-		int clipFlags;
+		AtomicInt clipFlags;
 
-		volatile int primitive;    // Current primitive to enter pipeline
-		volatile int count;        // Number of primitives to render
-		volatile int references;   // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
+		AtomicInt primitive;    // Current primitive to enter pipeline
+		AtomicInt count;        // Number of primitives to render
+		AtomicInt references;   // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
 
 		DrawData *data;
 	};
@@ -279,9 +279,9 @@
 				SUSPEND
 			};
 
-			volatile Type type;
-			volatile int primitiveUnit;
-			volatile int pixelCluster;
+			AtomicInt type;
+			AtomicInt primitiveUnit;
+			AtomicInt pixelCluster;
 		};
 
 		struct PrimitiveProgress
@@ -295,11 +295,11 @@
 				references = 0;
 			}
 
-			volatile int drawCall;
-			volatile int firstPrimitive;
-			volatile int primitiveCount;
-			volatile int visible;
-			volatile int references;
+			AtomicInt drawCall;
+			AtomicInt firstPrimitive;
+			AtomicInt primitiveCount;
+			AtomicInt visible;
+			AtomicInt references;
 		};
 
 		struct PixelProgress
@@ -311,9 +311,9 @@
 				executing = false;
 			}
 
-			volatile int drawCall;
-			volatile int processedPrimitives;
-			volatile bool executing;
+			AtomicInt drawCall;
+			AtomicInt processedPrimitives;
+			AtomicInt executing;
 		};
 
 	public:
@@ -449,8 +449,8 @@
 		Plane clipPlane[MAX_CLIP_PLANES];   // Transformed to clip space
 		bool updateClipPlanes;
 
-		volatile bool exitThreads;
-		volatile int threadsAwake;
+		AtomicInt exitThreads;
+		AtomicInt threadsAwake;
 		Thread *worker[16];
 		Event *resume[16];         // Events for resuming threads
 		Event *suspend[16];        // Events for suspending threads
@@ -464,8 +464,8 @@
 		DrawCall *drawCall[DRAW_COUNT];
 		DrawCall *drawList[DRAW_COUNT];
 
-		volatile int currentDraw;
-		volatile int nextDraw;
+		AtomicInt currentDraw;
+		AtomicInt nextDraw;
 
 		Task taskQueue[32];
 		unsigned int qHead;
diff --git a/src/Renderer/Surface.hpp b/src/Renderer/Surface.hpp
index 6418c08..8b3b8c5 100644
--- a/src/Renderer/Surface.hpp
+++ b/src/Renderer/Surface.hpp
@@ -245,7 +245,7 @@
 			int sliceB;
 			int sliceP;
 			Format format;
-			Lock lock;
+			AtomicInt lock;
 
 			bool dirty;
 		};