SpirvShader: Optimize stores with static equal offsets
This pattern is hit heavily by dEQP-VK.ssbo.*. Since all lanes write
to the same address, only one write can win; we elect a single active
lane and emit one scalar store instead of a scatter. Avoiding the
scatter is profitable on all non-AVX512-capable targets, where it
would be expanded by LLVM's ScalarizeMaskedMemIntrin pass, which is
incredibly slow.
Reduces runtime of dEQP-VK.ssbo.layout.random.all_shared_buffer.5 from
24s to 14s on my Threadripper (on top of a stack of other
optimizations).
Bug: b/135609394
Change-Id: I2d6840522a5bd30b4fd532b9c7e2a4712879caa9
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/33289
Tested-by: Chris Forbes <chrisforbes@google.com>
Presubmit-Ready: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index f5b8bf2..89c49ff 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -355,6 +355,28 @@
 		mask &= ptr.isInBounds(sizeof(float)); // Disable OOB writes.
 		if (!atomic && order == std::memory_order_relaxed)
 		{
+			if (ptr.hasStaticEqualOffsets())
+			{
+				If (AnyTrue(mask))
+				{
+					// All equal. One of these writes will win -- elect the winning lane.
+					auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+					auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
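+					// After masking with v0111, lane i of the swizzle OR is set when
+					// any lower-numbered lane of mask is active, so elect keeps only
+					// the lowest active lane.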
+					auto maskedVal = As<SIMD::Int>(val) & elect;
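+					// Only the elected lane's value survives the AND, so OR-reducing
+					// the four lanes recovers it as a scalar.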
+					auto scalarVal = Extract(maskedVal, 0) |
+						Extract(maskedVal, 1) |
+						Extract(maskedVal, 2) |
+						Extract(maskedVal, 3);
+					*rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], sizeof(float)) = As<EL>(scalarVal);
+				}
+				return;
+			}
+
 			if (ptr.hasStaticSequentialOffsets(sizeof(float)))
 			{
 				return rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, sizeof(float));