Reactor (LLVM): Add support for masked loads and stores.

Masked loads and stores have wider backend support than gather / scatter, and are faster.

Bug: b/135609394
Change-Id: Ib1435331f3130fbef7cbf9eaf1c0c2570a0ec2a1
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/33169
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 781bc15..0734763 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -1446,6 +1446,45 @@
 		}
 	}
 
+	// Emits an llvm.masked.load call: loads a vector of elTy elements from
+	// ptr, reading only the lanes whose mask element is non-zero. Masked-off
+	// lanes yield zero (the null passthrough value below).
+	Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment)
+	{
+		ASSERT(V(ptr)->getType()->isPointerTy());
+		ASSERT(V(mask)->getType()->isVectorTy());
+
+		auto numEls = V(mask)->getType()->getVectorNumElements();
+		auto i1Ty = ::llvm::Type::getInt1Ty(*::context);
+		auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
+		auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
+		auto elVecPtrTy = elVecTy->getPointerTo();
+		auto i1Mask = ::builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto passthrough = ::llvm::Constant::getNullValue(elVecTy);
+		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto func = ::llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy } );
+		return V(::builder->CreateCall(func, { V(ptr), align, i1Mask, passthrough }));
+	}
+
+	// Emits an llvm.masked.store call: stores the lanes of val whose mask
+	// element is non-zero to ptr; masked-off lanes are not written.
+	void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
+	{
+		ASSERT(V(ptr)->getType()->isPointerTy());
+		ASSERT(V(val)->getType()->isVectorTy());
+		ASSERT(V(mask)->getType()->isVectorTy());
+
+		auto numEls = V(mask)->getType()->getVectorNumElements();
+		auto i1Ty = ::llvm::Type::getInt1Ty(*::context);
+		auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
+		auto elVecTy = V(val)->getType();
+		auto elVecPtrTy = elVecTy->getPointerTo();
+		auto i1Mask = ::builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
+		auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
+		auto func = ::llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy } );
+		::builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
+	}
+
 	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment)
 	{
 		ASSERT(V(base)->getType()->isPointerTy());
diff --git a/src/Reactor/Nucleus.hpp b/src/Reactor/Nucleus.hpp
index 4190fd1..3362fb4 100644
--- a/src/Reactor/Nucleus.hpp
+++ b/src/Reactor/Nucleus.hpp
@@ -120,6 +120,10 @@
 		static Value *createStore(Value *value, Value *ptr, Type *type, bool isVolatile = false, unsigned int aligment = 0, bool atomic = false, std::memory_order memoryOrder = std::memory_order_relaxed);
 		static Value *createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex);
 
+		// Masked Load / Store instructions (per-lane predicated vector memory access)
+		static Value *createMaskedLoad(Value *base, Type *elementType, Value *mask, unsigned int alignment);
+		static void createMaskedStore(Value *base, Value *value, Value *mask, unsigned int alignment);
+
 		// Scatter / Gather instructions
 		static Value *createGather(Value *base, Type *elementType, Value *offsets, Value *mask, unsigned int alignment);
 		static void createScatter(Value *base, Value *value, Value *offsets, Value *mask, unsigned int alignment);
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 516f752..61a5c1f 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -4232,6 +4232,28 @@
 		Nucleus::setInsertBlock(bodyBB);
 	}
 
+	// Lane-predicated vector memory access: only lanes whose mask element is
+	// non-zero are read / written; masked-off load lanes return zero.
+	RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment)
+	{
+		return RValue<Float4>(Nucleus::createMaskedLoad(base.value, Float::getType(), mask.value, alignment));
+	}
+
+	RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment)
+	{
+		return RValue<Int4>(Nucleus::createMaskedLoad(base.value, Int::getType(), mask.value, alignment));
+	}
+
+	void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment)
+	{
+		Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
+	}
+
+	void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment)
+	{
+		Nucleus::createMaskedStore(base.value, val.value, mask.value, alignment);
+	}
+
 	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
 	{
 		return RValue<Float4>(Nucleus::createGather(base.value, Float::getType(), offsets.value, mask.value, alignment));
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 95cb094..92fd993 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2368,6 +2368,11 @@
 	}
 
 	// TODO: Use SIMD to template these.
+	RValue<Float4> MaskedLoad(RValue<Pointer<Float4>> base, RValue<Int4> mask, unsigned int alignment);  // masked-off lanes load as zero
+	RValue<Int4> MaskedLoad(RValue<Pointer<Int4>> base, RValue<Int4> mask, unsigned int alignment);  // masked-off lanes load as zero
+	void MaskedStore(RValue<Pointer<Float4>> base, RValue<Float4> val, RValue<Int4> mask, unsigned int alignment);  // masked-off lanes are not written
+	void MaskedStore(RValue<Pointer<Int4>> base, RValue<Int4> val, RValue<Int4> mask, unsigned int alignment);  // masked-off lanes are not written
+
 	RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
 	RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
 	void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 93bc879..4878875 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3468,6 +3468,8 @@
 	// Below are functions currently unimplemented for the Subzero backend.
 	// They are stubbed to satisfy the linker.
 	void Nucleus::createFence(std::memory_order memoryOrder) { UNIMPLEMENTED("Subzero createFence()"); }
+	Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createMaskedLoad()"); return nullptr; }  // real implementation lives in LLVMReactor.cpp
+	void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createMaskedStore()"); }  // real implementation lives in LLVMReactor.cpp
 	Value *Nucleus::createGather(Value *base, Type *elTy, Value *offsets, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createGather()"); return nullptr; }
 	void Nucleus::createScatter(Value *base, Value *val, Value *offsets, Value *mask, unsigned int alignment) { UNIMPLEMENTED("Subzero createScatter()"); }
 	RValue<Float4> Sin(RValue<Float4> x) { UNIMPLEMENTED("Subzero Sin()"); return Float4(0); }