SpirvShader: Optimize SIMD sequential, fully-in-bounds loads & stores

For sequential, fully-in-bounds vectors:

* Loads can safely be a regular vector load. We mask just to keep
  behavior consistent with rr::MaskedLoad and rr::Gather.
* Stores of non-atomics can be implemented as a read-modify-write.

These optimizations bring drastic performance improvements on
architectures that lack masked-read and masked-write instructions.

Bug: b/135609394
Change-Id: I552cc38f4aeae73f8db079a0a11da6a8db857710
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/33628
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
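For intuition, a minimal scalar sketch of the load fast path (an illustration under assumed names, not code from this change; the real implementation operates on 4-wide Reactor vectors):

    #include <cstdint>

    // Scalar model of the optimized load: when all four lanes are
    // sequential and statically in bounds, the gather collapses to a
    // single plain vector load. The mask is applied afterwards only to
    // match rr::MaskedLoad and rr::Gather, which zero inactive lanes.
    void loadFastPath(const int32_t *base, const int32_t mask[4], int32_t out[4])
    {
        for (int i = 0; i < 4; i++) // models one SIMD load + AND
        {
            out[i] = base[i] & mask[i];
        }
    }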
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 6323b05..bef561f 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -290,8 +290,19 @@
 T Load(Pointer ptr, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
 {
     using EL = typename Element<T>::type;
+
+    if (ptr.hasStaticSequentialOffsets(sizeof(float)) &&
+        ptr.isStaticAllInBounds(sizeof(float)))
+    {
+        // All elements sequential and in bounds.
+        // Perform regular load.
+        auto load = rr::Load(rr::Pointer<SIMD::Int>(ptr.base + ptr.staticOffsets[0]), alignment, atomic, order);
+        return As<T>(load & mask); // TODO: Mask here should be unnecessary, but keeps with MaskedLoad and Gather.
+    }
+
     auto offsets = ptr.offsets();
     mask &= ptr.isInBounds(sizeof(float)); // Disable OOB reads.
+
     if (!atomic && order == std::memory_order_relaxed)
     {
         if (ptr.hasStaticEqualOffsets())
@@ -351,6 +362,7 @@
 void Store(Pointer ptr, T val, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 {
     using EL = typename Element<T>::type;
+    constexpr size_t alignment = sizeof(float);
     auto offsets = ptr.offsets();
     mask &= ptr.isInBounds(sizeof(float)); // Disable OOB writes.
     if (!atomic && order == std::memory_order_relaxed)
@@ -367,16 +379,28 @@
                 Extract(maskedVal, 1) |
                 Extract(maskedVal, 2) |
                 Extract(maskedVal, 3);
-            *rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], sizeof(float)) = As<EL>(scalarVal);
+            *rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], alignment) = As<EL>(scalarVal);
         }
-        return;
     }
-
-    if (ptr.hasStaticSequentialOffsets(sizeof(float)))
+    else if (ptr.hasStaticSequentialOffsets(sizeof(float)))
     {
-        return rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, sizeof(float));
+        if (ptr.isStaticAllInBounds(sizeof(float)))
+        {
+            // Pointer has no elements OOB, and the store is not atomic.
+            // Perform a RMW.
+            auto p = rr::Pointer<SIMD::Int>(ptr.base + ptr.staticOffsets[0], alignment);
+            auto prev = *p;
+            *p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
+        }
+        else
+        {
+            rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, alignment);
+        }
     }
-    return rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, sizeof(float));
+    else
+    {
+        rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, alignment);
+    }
 }
 else
 {
@@ -385,7 +409,7 @@
     {
         // Store all elements in a single SIMD instruction.
         auto offset = Extract(offsets, 0);
-        Store(val, rr::Pointer<T>(&ptr.base[offset]), sizeof(float), atomic, order);
+        Store(val, rr::Pointer<T>(&ptr.base[offset]), alignment, atomic, order);
     }
     Else
     {
@@ -395,7 +419,7 @@
         If(Extract(mask, i) != 0)
         {
             auto offset = Extract(offsets, i);
-            rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), sizeof(float), atomic, order);
+            rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), alignment, atomic, order);
         }
     }
 }
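The store side, as a similar hedged scalar sketch (hypothetical standalone function): a non-atomic masked store over a fully-in-bounds, sequential range becomes load, blend, store, so no masked-write instruction is needed:

    #include <cstdint>

    // Scalar model of the read-modify-write store: read the previous
    // contents, blend the active lanes in under the mask, write back.
    // This is safe only because every lane is statically in bounds and
    // the store is non-atomic (inactive lanes' bytes are rewritten with
    // their previous values).
    void storeRMW(int32_t *p, const int32_t val[4], const int32_t mask[4])
    {
        for (int i = 0; i < 4; i++) // models one SIMD load/blend/store
        {
            int32_t prev = p[i];
            p[i] = (prev & ~mask[i]) | (val[i] & mask[i]);
        }
    }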
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 0ac6116..60078d2 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -142,14 +142,14 @@
 {
     ASSERT(accessSize > 0);
 
+    if (isStaticAllInBounds(accessSize))
+    {
+        return SIMD::Int(0xffffffff);
+    }
+
     if (!hasDynamicOffsets && !hasDynamicLimit)
     {
         // Common fast paths.
-        if (hasStaticEqualOffsets())
-        {
-            return SIMD::Int((staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0);
-        }
-
         static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
         return SIMD::Int(
             (staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
@@ -161,6 +161,22 @@
     return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
 }
 
+inline bool isStaticAllInBounds(unsigned int accessSize) const
+{
+    if (hasDynamicOffsets || hasDynamicLimit)
+    {
+        return false;
+    }
+    for (int i = 0; i < SIMD::Width; i++)
+    {
+        if (staticOffsets[i] + accessSize - 1 >= staticLimit)
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
 inline Int limit() const
 {
     return dynamicLimit + staticLimit;
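A worked example of the bounds test in isStaticAllInBounds, restated as a hypothetical standalone function with assumed values:

    #include <cassert>
    #include <cstdint>

    // A lane whose access covers bytes [offset, offset + accessSize - 1]
    // is in bounds iff its last byte lies below the limit.
    bool allInBounds(const int32_t offsets[4], unsigned accessSize, int32_t limit)
    {
        for (int i = 0; i < 4; i++)
        {
            if (offsets[i] + (int32_t)accessSize - 1 >= limit)
            {
                return false;
            }
        }
        return true;
    }

    int main()
    {
        const int32_t offs[4] = { 0, 4, 8, 12 };
        assert(allInBounds(offs, 4, 16));  // lane 3 ends at byte 15 < 16
        assert(!allInBounds(offs, 4, 15)); // lane 3's last byte is 15 >= 15
    }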