SpirvShader: Optimize SIMD sequential, fully-in-bounds loads & stores

For sequential, fully-in-bounds vectors:
* Loads can safely be performed as a regular vector load. The result is still masked, only to keep behavior consistent with rr::MaskedLoad and rr::Gather.
* Non-atomic stores can be implemented as a read-modify-write.

These optimizations yield substantial performance improvements on architectures that lack masked-load and masked-store instructions.
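
A minimal scalar sketch of the two fast paths, in plain C++ rather than Reactor (kWidth, base, offset, mask and val are illustrative stand-ins for the SIMD::Pointer state in SpirvShader.cpp, and offset is treated as an element index instead of a byte offset):

    #include <cstdint>

    constexpr int kWidth = 4;  // SIMD::Width is 4.

    // Sequential, fully-in-bounds load: a plain vector load. The result is
    // masked only so that disabled lanes read as zero, matching the behavior
    // of rr::MaskedLoad and rr::Gather.
    void LoadFast(const uint32_t *base, int offset,
                  const uint32_t mask[kWidth], uint32_t out[kWidth])
    {
        for (int i = 0; i < kWidth; i++)
        {
            out[i] = base[offset + i] & mask[i];
        }
    }

    // Sequential, fully-in-bounds, non-atomic store: a read-modify-write that
    // preserves the previous memory contents in lanes whose mask is zero.
    void StoreFast(uint32_t *base, int offset,
                   const uint32_t mask[kWidth], const uint32_t val[kWidth])
    {
        for (int i = 0; i < kWidth; i++)
        {
            uint32_t prev = base[offset + i];
            base[offset + i] = (prev & ~mask[i]) | (val[i] & mask[i]);
        }
    }

In the Reactor code each loop becomes a single full-width SIMD load or store plus bitwise ops, avoiding the per-lane emulation otherwise required on targets without masked memory instructions.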

Bug: b/135609394
Change-Id: I552cc38f4aeae73f8db079a0a11da6a8db857710
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/33628
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 6323b05..bef561f 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -290,8 +290,19 @@
 		T Load(Pointer ptr, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
 		{
 			using EL = typename Element<T>::type;
+
+			if (ptr.hasStaticSequentialOffsets(sizeof(float)) &&
+				ptr.isStaticAllInBounds(sizeof(float)))
+			{
+				// All elements sequential and in bounds.
+				// Perform regular load.
+				auto load = rr::Load(rr::Pointer<SIMD::Int>(ptr.base + ptr.staticOffsets[0]), alignment, atomic, order);
+				return As<T>(load & mask); // TODO: Mask here should be unnecessary, but keeps with MaskedLoad and Gather.
+			}
+
 			auto offsets = ptr.offsets();
 			mask &= ptr.isInBounds(sizeof(float)); // Disable OOB reads.
+
 			if (!atomic && order == std::memory_order_relaxed)
 			{
 				if (ptr.hasStaticEqualOffsets())
@@ -351,6 +362,7 @@
 		void Store(Pointer ptr, T val, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 		{
 			using EL = typename Element<T>::type;
+			constexpr size_t alignment = sizeof(float);
 			auto offsets = ptr.offsets();
 			mask &= ptr.isInBounds(sizeof(float)); // Disable OOB writes.
 			if (!atomic && order == std::memory_order_relaxed)
@@ -367,16 +379,28 @@
 							Extract(maskedVal, 1) |
 							Extract(maskedVal, 2) |
 							Extract(maskedVal, 3);
-						*rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], sizeof(float)) = As<EL>(scalarVal);
+						*rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], alignment) = As<EL>(scalarVal);
 					}
-					return;
 				}
-
-				if (ptr.hasStaticSequentialOffsets(sizeof(float)))
+				else if (ptr.hasStaticSequentialOffsets(sizeof(float)))
 				{
-					return rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, sizeof(float));
+					if (ptr.isStaticAllInBounds(sizeof(float)))
+					{
+						// Pointer has no elements OOB, and the store is not atomic.
+						// Perform a RMW.
+						auto p = rr::Pointer<SIMD::Int>(ptr.base + ptr.staticOffsets[0], alignment);
+						auto prev = *p;
+						*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
+					}
+					else
+					{
+						rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, alignment);
+					}
 				}
-				return rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, sizeof(float));
+				else
+				{
+					rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, alignment);
+				}
 			}
 			else
 			{
@@ -385,7 +409,7 @@
 				{
 					// Store all elements in a single SIMD instruction.
 					auto offset = Extract(offsets, 0);
-					Store(val, rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
+					Store(val, rr::Pointer<T>(&ptr.base[offset]), alignment, atomic, order);
 				}
 				Else
 				{
@@ -395,7 +419,7 @@
 						If(Extract(mask, i) != 0)
 						{
 							auto offset = Extract(offsets, i);
-							rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order);
+							rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), alignment, atomic, order);
 						}
 					}
 				}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 0ac6116..60078d2 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -142,14 +142,14 @@
 			{
 				ASSERT(accessSize > 0);
 
+				if (isStaticAllInBounds(accessSize))
+				{
+					return SIMD::Int(0xffffffff);
+				}
+
 				if (!hasDynamicOffsets && !hasDynamicLimit)
 				{
 					// Common fast paths.
-					if (hasStaticEqualOffsets())
-					{
-						return SIMD::Int((staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0);
-					}
-
 					static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
 					return SIMD::Int(
 						(staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
@@ -161,6 +161,22 @@
 				return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
 			}
 
+			inline bool isStaticAllInBounds(unsigned int accessSize) const
+			{
+				if (hasDynamicOffsets || hasDynamicLimit)
+				{
+					return false;
+				}
+				for (int i = 0; i < SIMD::Width; i++)
+				{
+					if (staticOffsets[i] + accessSize - 1 >= staticLimit)
+					{
+						return false;
+					}
+				}
+				return true;
+			}
+
 			inline Int limit() const
 			{
 				return dynamicLimit + staticLimit;