Reactor: Add Gather and Scatter instructions.

Use these as a fast path for Load() and Store().

This is an attempt to fix the severe performance hit incurred by the robustness (out-of-bounds access) checks.
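
The new entry points are declared in Reactor.hpp and take a scalar base
pointer, a vector of per-lane byte offsets, and a lane mask. A minimal
usage sketch (the routine, offsets and mask values below are illustrative,
not taken from this change):

    Function<Void(Pointer<Float>)> function;
    {
        Pointer<Float> base = function.Arg<0>();
        Int4 offsets = Int4(0, 4, 8, 12);  // per-lane byte offsets from base
        Int4 mask = Int4(~0);              // ~0 enables a lane, 0 disables it
        Float4 v = Gather(base, offsets, mask, sizeof(float));  // masked vector load
        Scatter(base, v, offsets, mask, sizeof(float));         // masked vector store
        Return();
    }

On the LLVM backend these lower to the llvm.masked.gather and
llvm.masked.scatter intrinsics; the Subzero backend stubs them as
unimplemented. The target machine attributes are also switched to the full
set reported by llvm::sys::getHostCPUFeatures(), presumably so that the
intrinsics can lower to native instructions where the host supports them.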

Bug: b/131224163
Change-Id: I3e244bed5ed723cf29538ff022781c813caaa5eb
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/29454
Tested-by: Ben Clayton <bclayton@google.com>
Presubmit-Ready: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index bd51839..810babc 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -239,37 +239,44 @@
 		T Load(Pointer ptr, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 		{
 			using EL = typename Element<T>::type;
-			T out;
 			auto offsets = ptr.offsets();
 			mask &= CmpLT(offsets + SIMD::Int(sizeof(float) - 1), SIMD::Int(ptr.limit)); // Disable OOB reads.
-			auto anyLanesDisabled = AnyFalse(mask);
-			If(ptr.hasEqualOffsets() && !anyLanesDisabled)
+			if (!atomic && order == std::memory_order_relaxed)
 			{
-				// Load one, replicate.
-				auto offset = Extract(offsets, 0);
-				out = T(Load(rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order));
+				return rr::Gather(rr::Pointer<EL>(ptr.base), offsets, mask, sizeof(float));
 			}
-			Else If(ptr.hasSequentialOffsets() && !anyLanesDisabled)
+			else
 			{
-				// Load all elements in a single SIMD instruction.
-				auto offset = Extract(offsets, 0);
-				out = Load(rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
-			}
-			Else
-			{
-				// Divergent offsets or masked lanes - load each element individually.
-				out = T(0);
-				for (int i = 0; i < SIMD::Width; i++)
+				T out;
+				auto anyLanesDisabled = AnyFalse(mask);
+				If(ptr.hasEqualOffsets() && !anyLanesDisabled)
 				{
-					If(Extract(mask, i) != 0)
+					// Load one, replicate.
+					auto offset = Extract(offsets, 0);
+					out = T(rr::Load(rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order));
+				}
+				Else If(ptr.hasSequentialOffsets() && !anyLanesDisabled)
+				{
+					// Load all elements in a single SIMD instruction.
+					auto offset = Extract(offsets, 0);
+					out = rr::Load(rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
+				}
+				Else
+				{
+					// Divergent offsets or masked lanes.
+					out = T(0);
+					for (int i = 0; i < SIMD::Width; i++)
 					{
-						auto offset = Extract(offsets, i);
-						auto el = rr::Load(rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order);
-						out = Insert(out, el, i);
+						If(Extract(mask, i) != 0)
+						{
+							auto offset = Extract(offsets, i);
+							auto el = rr::Load(rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order);
+							out = Insert(out, el, i);
+						}
 					}
 				}
+				return out;
 			}
-			return out;
 		}
 
 		template<typename T>
@@ -278,22 +285,29 @@
 			using EL = typename Element<T>::type;
 			auto offsets = ptr.offsets();
 			mask &= CmpLT(offsets + SIMD::Int(sizeof(float) - 1), SIMD::Int(ptr.limit)); // Disable OOB writes.
-			auto anyLanesDisabled = AnyFalse(mask);
-			If(ptr.hasSequentialOffsets() && !anyLanesDisabled)
+			if (!atomic && order == std::memory_order_relaxed)
 			{
-				// Store all elements in a single SIMD instruction.
-				auto offset = Extract(offsets, 0);
-				Store(val, rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
+				return rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, sizeof(float));
 			}
-			Else
+			else
 			{
-				// Divergent offsets or masked lanes.
-				for (int i = 0; i < SIMD::Width; i++)
+				auto anyLanesDisabled = AnyFalse(mask);
+				If(ptr.hasSequentialOffsets() && !anyLanesDisabled)
 				{
-					If(Extract(mask, i) != 0)
+					// Store all elements in a single SIMD instruction.
+					auto offset = Extract(offsets, 0);
+					Store(val, rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
+				}
+				Else
+				{
+					// Divergent offsets or masked lanes.
+					for (int i = 0; i < SIMD::Width; i++)
 					{
-						auto offset = Extract(offsets, i);
-						rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order);
+						If(Extract(mask, i) != 0)
+						{
+							auto offset = Extract(offsets, i);
+							rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order);
+						}
 					}
 				}
 			}
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index a4e820d..643ef05 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -882,7 +882,17 @@
 		#error "unknown architecture"
 		#endif
 
-		llvm::SmallVector<std::string, 1> mattrs;
+		llvm::SmallVector<std::string, 8> mattrs;
+
+		llvm::StringMap<bool> features;
+		bool ok = llvm::sys::getHostCPUFeatures(features);
+		ASSERT_MSG(ok, "llvm::sys::getHostCPUFeatures returned false");
+		for (auto &feature : features)
+		{
+			if (feature.second) { mattrs.push_back(feature.first()); }
+		}
+
+#if 0
 #if defined(__i386__) || defined(__x86_64__)
 		mattrs.push_back(CPUID::supportsMMX()    ? "+mmx"    : "-mmx");
 		mattrs.push_back(CPUID::supportsCMOV()   ? "+cmov"   : "-cmov");
@@ -899,6 +909,7 @@
 		// might fail to link.
 #endif
 #endif
+#endif
 
 		llvm::TargetOptions targetOpts;
 		targetOpts.UnsafeFPMath = false;
@@ -1299,6 +1310,55 @@
 		}
 	}
 
+	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment)
+	{
+		ASSERT(V(base)->getType()->isPointerTy());
+		ASSERT(V(offsets)->getType()->isVectorTy());
+		ASSERT(V(mask)->getType()->isVectorTy());
+
+		auto numEls = V(mask)->getType()->getVectorNumElements();
+		auto i1Ty = ::llvm::Type::getInt1Ty(*::context);
+		auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
+		auto i8Ty = ::llvm::Type::getInt8Ty(*::context);
+		auto i8PtrTy = i8Ty->getPointerTo();
+		auto elPtrTy = T(elTy)->getPointerTo();
+		auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
+		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+		auto i8Base = ::builder->CreatePointerCast(V(base), i8PtrTy);
+		auto i8Ptrs = ::builder->CreateGEP(i8Base, V(offsets));
+		auto elPtrs = ::builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
+		auto i8Mask = ::builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto passthrough = ::llvm::Constant::getNullValue(elVecTy);
+		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto func = ::llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy } );
+		return V(::builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough }));
+	}
+
+	void Nucleus::createScatter(Value *base, Value *val, Value *offsets, Value *mask, unsigned int alignment)
+	{
+		ASSERT(V(base)->getType()->isPointerTy());
+		ASSERT(V(val)->getType()->isVectorTy());
+		ASSERT(V(offsets)->getType()->isVectorTy());
+		ASSERT(V(mask)->getType()->isVectorTy());
+
+		auto numEls = V(mask)->getType()->getVectorNumElements();
+		auto i1Ty = ::llvm::Type::getInt1Ty(*::context);
+		auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
+		auto i8Ty = ::llvm::Type::getInt8Ty(*::context);
+		auto i8PtrTy = i8Ty->getPointerTo();
+		auto elVecTy = V(val)->getType();
+		auto elTy = elVecTy->getVectorElementType();
+		auto elPtrTy = elTy->getPointerTo();
+		auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
+		auto i8Base = ::builder->CreatePointerCast(V(base), i8PtrTy);
+		auto i8Ptrs = ::builder->CreateGEP(i8Base, V(offsets));
+		auto elPtrs = ::builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
+		auto i8Mask = ::builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto func = ::llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy } );
+		::builder->CreateCall(func, { V(val), elPtrs, align, i8Mask });
+	}
+
 	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
 	{
 		RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/src/Reactor/Nucleus.hpp b/src/Reactor/Nucleus.hpp
index a524595..cd24be6 100644
--- a/src/Reactor/Nucleus.hpp
+++ b/src/Reactor/Nucleus.hpp
@@ -100,6 +100,10 @@
 		static Value *createStore(Value *value, Value *ptr, Type *type, bool isVolatile = false, unsigned int alignment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
 		static Value *createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex);
 
+		// Scatter / Gather instructions
+		static Value *createGather(Value *base, Type *elementType, Value *offsets, Value *mask, unsigned int alignment);
+		static void createScatter(Value *base, Value *value, Value *offsets, Value *mask, unsigned int alignment);
+
 		// Atomic instructions
 		static Value *createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
 		static Value *createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder = std::memory_order_relaxed);
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index b2c99e2..faaf679 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -4212,4 +4212,25 @@
 		Nucleus::createCondBr(cmp.value, bodyBB, endBB);
 		Nucleus::setInsertBlock(bodyBB);
 	}
+
+	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return RValue<Float4>(Nucleus::createGather(base.value, Float::getType(), offsets.value, mask.value, alignment));
+	}
+
+	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		return RValue<Int4>(Nucleus::createGather(base.value, Int::getType(), offsets.value, mask.value, alignment));
+	}
+
+	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		Nucleus::createScatter(base.value, val.value, offsets.value, mask.value, alignment);
+	}
+
+	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+	{
+		Nucleus::createScatter(base.value, val.value, offsets.value, mask.value, alignment);
+	}
+
 }
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 1c7b5ca..25682ab 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2355,6 +2355,12 @@
 		return Load(RValue<Pointer<T>>(pointer), alignment, atomic, memoryOrder);
 	}
 
+	// TODO: Use SIMD to template these.
+	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+	void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+
 	template<typename T>
 	void Store(RValue<T> value, RValue<Pointer<T>> pointer, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
 	{
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 86f0c00..c5fa7c9 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3453,6 +3453,8 @@
 
 	// Below are functions currently unimplemented for the Subzero backend.
 	// They are stubbed to satisfy the linker.
+	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createGather()"); return nullptr; }
+	void Nucleus::createScatter(Value *base, Value *val, Value *offsets, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createScatter()"); }
 	RValue<Float4> Sin(RValue<Float4> x) { UNIMPLEMENTED("Subzero Sin()"); return Float4(0); }
 	RValue<Float4> Cos(RValue<Float4> x) { UNIMPLEMENTED("Subzero Cos()"); return Float4(0); }
 	RValue<Float4> Tan(RValue<Float4> x) { UNIMPLEMENTED("Subzero Tan()"); return Float4(0); }