Emulate rounding to the nearest integer.
This implementation works by adding a large value which makes the
fractional part no longer fit in the mantissa, and then subtracting it
again. It matches nearbyint() for values up to 2^22, positive or
negative.
The 'magic number' of 0x00C00000 is derived by first observing that the
integer values 0x00800000 to 0x00FFFFFF can be represented exactly in
single-precision floating-point format but can't have a fractional part
because there are 24 mantissa bits (the top one being hidden). So adding
0x00800000 to, for example, 0.6 forces the hardware to round the sum to
the nearest representable value, which is 0x00800001. Subtracting
0x00800000 again gives us 1.0. This works for rounding any value from
0.0 to 0x007FFFFF. However, it doesn't work for negative values, because
the intermediate result would be less than 0x00800000 and thus leave
some room for fractional bits in the mantissa. The solution is to use
0x00C00000 instead so the range gets split between positive and negative
values: inputs from -0x00400000 to 0x00400000 (+/-2^22) round correctly.
Note that values greater than the upper bound will still round to
integers, but not necessarily the nearest ones, while values less than
the lower bound can result in fractional values.
Bug b/37495485
Change-Id: I1aed2d831269fcf21b8d3313856a9b9756a532ef
Reviewed-on: https://swiftshader-review.googlesource.com/9488
Reviewed-by: Nicolas Capens <capn@google.com>
Reviewed-by: Corentin Wallez <cwallez@google.com>
Tested-by: Nicolas Capens <capn@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 5222ddb..c1d7cb9 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -77,6 +77,7 @@
class CPUID
{
public:
+ const static bool ARM;
const static bool SSE4_1;
private:
@@ -96,6 +97,17 @@
#endif
}
+ static bool detectARM()
+ {
+ #if defined(__arm__)
+ return true;
+ #elif defined(__i386__) || defined(__x86_64__)
+ return false;
+ #else
+ #error "Unknown architecture"
+ #endif
+ }
+
static bool detectSSE4_1()
{
#if defined(__i386__) || defined(__x86_64__)
@@ -108,7 +120,9 @@
}
};
+ const bool CPUID::ARM = CPUID::detectARM();
const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
+ const bool emulateIntrinsics = CPUID::ARM;
}
namespace sw
@@ -4146,14 +4160,22 @@
RValue<Int> RoundInt(RValue<Float> cast)
{
- Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
- const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
- auto target = ::context->getConstantUndef(Ice::IceType_i32);
- auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
- nearbyint->addArg(cast.value);
- ::basicBlock->appendInst(nearbyint);
+ if(emulateIntrinsics)
+ {
+ // Push the fractional part off the mantissa. Accurate up to +/-2^22.
+ return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
+ }
+ else
+ {
+ Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
+ const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+ auto target = ::context->getConstantUndef(Ice::IceType_i32);
+ auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+ nearbyint->addArg(cast.value);
+ ::basicBlock->appendInst(nearbyint);
- return RValue<Int>(V(result));
+ return RValue<Int>(V(result));
+ }
}
Type *Int::getType()
@@ -5301,14 +5323,22 @@
RValue<Int4> RoundInt(RValue<Float4> cast)
{
- Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
- const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
- auto target = ::context->getConstantUndef(Ice::IceType_i32);
- auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
- nearbyint->addArg(cast.value);
- ::basicBlock->appendInst(nearbyint);
+ if(emulateIntrinsics)
+ {
+ // Push the fractional part off the mantissa. Accurate up to +/-2^22.
+ return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
+ }
+ else
+ {
+ Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+ const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+ auto target = ::context->getConstantUndef(Ice::IceType_i32);
+ auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+ nearbyint->addArg(cast.value);
+ ::basicBlock->appendInst(nearbyint);
- return RValue<Int4>(V(result));
+ return RValue<Int4>(V(result));
+ }
}
RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
@@ -6247,7 +6277,12 @@
RValue<Float4> Round(RValue<Float4> x)
{
- if(CPUID::SSE4_1)
+ if(emulateIntrinsics)
+ {
+ // Push the fractional part off the mantissa. Accurate up to +/-2^22.
+ return (x + Float4(0x00C00000)) - Float4(0x00C00000);
+ }
+ else if(CPUID::SSE4_1)
{
Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};