SpirvShader: Optimize SIMD sequential, fully-in-bounds loads & stores
For vectors that are sequential and fully in bounds:
* Loads can safely be performed as a regular vector load. We mask the result only to keep behavior consistent with rr::MaskedLoad and rr::Gather.
* Non-atomic stores can be implemented as a read-modify-write of the whole vector.
These optimizations yield drastic performance improvements on architectures that lack masked-load and masked-store instructions (see the sketch below).
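
For illustration, a minimal scalar sketch of the two fast paths (plain
C++, not Reactor code; Width, LoadFast, and StoreFast are illustrative
names, assuming each mask lane is all-ones or all-zeros):

    #include <cstdint>

    constexpr int Width = 4;  // mirrors SIMD::Width

    // Fast load: all lanes are sequential and in bounds, so one
    // full-width load is safe. The AND only zeroes inactive lanes,
    // matching what rr::MaskedLoad / rr::Gather would return.
    void LoadFast(const uint32_t *base, const uint32_t mask[Width],
                  uint32_t out[Width])
    {
        for (int i = 0; i < Width; i++)
        {
            out[i] = base[i] & mask[i];
        }
    }

    // Fast store as a read-modify-write: read the old vector, blend in
    // the new values where the mask is set, and write the whole vector
    // back. Safe only because every lane is in bounds and the store is
    // non-atomic.
    void StoreFast(uint32_t *base, const uint32_t val[Width],
                   const uint32_t mask[Width])
    {
        for (int i = 0; i < Width; i++)
        {
            base[i] = (base[i] & ~mask[i]) | (val[i] & mask[i]);
        }
    }
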
Bug: b/135609394
Change-Id: I552cc38f4aeae73f8db079a0a11da6a8db857710
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/33628
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 6323b05..bef561f 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -290,8 +290,19 @@
T Load(Pointer ptr, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
using EL = typename Element<T>::type;
+
+ if (ptr.hasStaticSequentialOffsets(sizeof(float)) &&
+ ptr.isStaticAllInBounds(sizeof(float)))
+ {
+ // All elements sequential and in bounds.
+ // Perform regular load.
+ auto load = rr::Load(rr::Pointer<SIMD::Int>(ptr.base + ptr.staticOffsets[0]), alignment, atomic, order);
+ return As<T>(load & mask); // TODO: The mask here should be unnecessary, but it keeps behavior consistent with MaskedLoad and Gather.
+ }
+
auto offsets = ptr.offsets();
mask &= ptr.isInBounds(sizeof(float)); // Disable OOB reads.
+
if (!atomic && order == std::memory_order_relaxed)
{
if (ptr.hasStaticEqualOffsets())
@@ -351,6 +362,7 @@
void Store(Pointer ptr, T val, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
using EL = typename Element<T>::type;
+ constexpr size_t alignment = sizeof(float);
auto offsets = ptr.offsets();
mask &= ptr.isInBounds(sizeof(float)); // Disable OOB writes.
if (!atomic && order == std::memory_order_relaxed)
@@ -367,16 +379,28 @@
Extract(maskedVal, 1) |
Extract(maskedVal, 2) |
Extract(maskedVal, 3);
- *rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], sizeof(float)) = As<EL>(scalarVal);
+ *rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], alignment) = As<EL>(scalarVal);
}
- return;
}
-
- if (ptr.hasStaticSequentialOffsets(sizeof(float)))
+ else if (ptr.hasStaticSequentialOffsets(sizeof(float)))
{
- return rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, sizeof(float));
+ if (ptr.isStaticAllInBounds(sizeof(float)))
+ {
+ // Pointer has no elements OOB, and the store is not atomic.
+ // Perform a RMW.
+ auto p = rr::Pointer<SIMD::Int>(ptr.base + ptr.staticOffsets[0], alignment);
+ auto prev = *p;
+ *p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
+ }
+ else
+ {
+ rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, alignment);
+ }
}
- return rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, sizeof(float));
+ else
+ {
+ rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, alignment);
+ }
}
else
{
@@ -385,7 +409,7 @@
{
// Store all elements in a single SIMD instruction.
auto offset = Extract(offsets, 0);
- Store(val, rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
+ Store(val, rr::Pointer<T>(&ptr.base[offset]), alignment, atomic, order);
}
Else
{
@@ -395,7 +419,7 @@
If(Extract(mask, i) != 0)
{
auto offset = Extract(offsets, i);
- rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order);
+ rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), alignment, atomic, order);
}
}
}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 0ac6116..60078d2 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -142,14 +142,14 @@
{
ASSERT(accessSize > 0);

+ if (isStaticAllInBounds(accessSize))
+ {
+ return SIMD::Int(0xffffffff);
+ }
+
if (!hasDynamicOffsets && !hasDynamicLimit)
{
// Common fast paths.
- if (hasStaticEqualOffsets())
- {
- return SIMD::Int((staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0);
- }
-
static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
return SIMD::Int(
(staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
@@ -161,6 +161,22 @@
return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
}

+ inline bool isStaticAllInBounds(unsigned int accessSize) const
+ {
+ if (hasDynamicOffsets || hasDynamicLimit)
+ {
+ return false;
+ }
+ for (int i = 0; i < SIMD::Width; i++)
+ {
+ if (staticOffsets[i] + accessSize - 1 >= staticLimit)
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
inline Int limit() const
{
return dynamicLimit + staticLimit;
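
For reference, a standalone sketch of the bounds condition that the new
isStaticAllInBounds() helper evaluates (illustrative names; assumes
purely static offsets and limit):

    #include <cassert>

    constexpr int Width = 4;  // mirrors SIMD::Width

    // True iff every lane's access [offset, offset + accessSize) stays
    // below the limit, i.e. offset + accessSize - 1 < limit in each lane.
    bool allInBounds(const int offsets[Width], unsigned int accessSize, int limit)
    {
        assert(accessSize > 0);
        for (int i = 0; i < Width; i++)
        {
            if (offsets[i] + static_cast<int>(accessSize) - 1 >= limit)
            {
                return false;
            }
        }
        return true;
    }

    // e.g. offsets {0, 4, 8, 12} with accessSize 4: limit 16 -> true
    // (the last lane touches bytes 12..15); limit 15 -> false.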