SpirvShader: Replace hand-rolled bitcount with LLVM intrinsic Moved the hand-rolled implementation to Subzero. As we've started exposing bit intrinsics, we might as well fix the TODOs. Bug: b/126873455 Tests: dEQP-VK.glsl.builtin.function.integer.bitcount.* Change-Id: Ic37dfd5d73187f2b3afa444abfd9e22439c871b1 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/28792 Presubmit-Ready: Ben Clayton <bclayton@google.com> Kokoro-Presubmit: kokoro <noreply+kokoro@google.com> Tested-by: Ben Clayton <bclayton@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp index 201541d..18f85cf 100644 --- a/src/Pipeline/SpirvShader.cpp +++ b/src/Pipeline/SpirvShader.cpp
@@ -2771,25 +2771,11 @@ break; } case spv::OpBitReverse: - { dst.move(i, BitReverse(src.UInt(i))); break; - } case spv::OpBitCount: - { - // TODO: Add an intrinsic to reactor. Even if there isn't a - // single vector instruction, there may be target-dependent - // ways to make this faster. - // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - auto v = src.UInt(i); - SIMD::UInt c = v - ((v >> 1) & SIMD::UInt(0x55555555)); - c = ((c >> 2) & SIMD::UInt(0x33333333)) + (c & SIMD::UInt(0x33333333)); - c = ((c >> 4) + c) & SIMD::UInt(0x0F0F0F0F); - c = ((c >> 8) + c) & SIMD::UInt(0x00FF00FF); - c = ((c >> 16) + c) & SIMD::UInt(0x0000FFFF); - dst.move(i, c); + dst.move(i, BitCount(src.UInt(i))); break; - } case spv::OpSNegate: dst.move(i, -src.Int(i)); break;
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp index de30f2e..ad72d6f 100644 --- a/src/Reactor/LLVMReactor.cpp +++ b/src/Reactor/LLVMReactor.cpp
@@ -3220,6 +3220,12 @@ return RValue<UInt4>(V(::builder->CreateCall(func, { V(v.value) }))); } + RValue<UInt4> BitCount(RValue<UInt4> v) + { + auto func = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::ctpop, { T(UInt4::getType()) } ); + return RValue<UInt4>(V(::builder->CreateCall(func, { V(v.value) }))); + } + RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef) { #if REACTOR_LLVM_VERSION < 7
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp index c119d45..5094d4b 100644 --- a/src/Reactor/Reactor.hpp +++ b/src/Reactor/Reactor.hpp
@@ -2233,6 +2233,7 @@ // Bit Manipulation functions. // TODO: Currentlhy unimplemented for Subzero. RValue<UInt4> BitReverse(RValue<UInt4> x); + RValue<UInt4> BitCount(RValue<UInt4> x); // Count leading zeros. // Returns 32 when: isZeroUndef && x == 0.
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp index c6e16f0..5a2d3a4 100644 --- a/src/Reactor/SubzeroReactor.cpp +++ b/src/Reactor/SubzeroReactor.cpp
@@ -3371,6 +3371,17 @@ return v; } + RValue<UInt4> BitCount(RValue<UInt4> x) + { + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + UInt4 v = x - ((x >> 1) & UInt4(0x55555555)); + v = ((v >> 2) & UInt4(0x33333333)) + (v & UInt4(0x33333333)); + v = ((v >> 4) + v) & UInt4(0x0F0F0F0F); + v = ((v >> 8) + v) & UInt4(0x00FF00FF); + v = ((v >> 16) + v) & UInt4(0x0000FFFF); + return v; + } + Type *Float4::getType() { return T(Ice::IceType_v4f32);