Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now. We'll need the fallbacks in any case. However, once we've decided on how to specify the CPU features of the user machine, we can use the nicer LZCNT/TZCNT/POPCNT as well. Adds cmov, bsf, and bsr instructions. Calls a popcount helper function for machines without SSE4.2. Not handling bswap yet (which can also take i16 params). BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882 R=stichnot@chromium.org, wala@chromium.org Review URL: https://codereview.chromium.org/390443005
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp index 38b6fc6..45c3151 100644 --- a/src/IceTargetLoweringX8632.cpp +++ b/src/IceTargetLoweringX8632.cpp
@@ -39,7 +39,7 @@ const struct TableFcmp_ { uint32_t Default; bool SwapOperands; - InstX8632Br::BrCond C1, C2; + InstX8632::BrCond C1, C2; } TableFcmp[] = { #define X(val, dflt, swap, C1, C2) \ { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ @@ -54,7 +54,7 @@ // x86 conditional branch instruction. const struct TableIcmp32_ { - InstX8632Br::BrCond Mapping; + InstX8632::BrCond Mapping; } TableIcmp32[] = { #define X(val, C_32, C1_64, C2_64, C3_64) \ { InstX8632Br::C_32 } \ @@ -69,7 +69,7 @@ // conditional branches are needed. For the other conditions, three separate // conditional branches are needed. const struct TableIcmp64_ { - InstX8632Br::BrCond C1, C2, C3; + InstX8632::BrCond C1, C2, C3; } TableIcmp64[] = { #define X(val, C_32, C1_64, C2_64, C3_64) \ { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \ @@ -79,7 +79,7 @@ }; const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64); -InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { +InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { size_t Index = static_cast<size_t>(Cond); assert(Index < TableIcmp32Size); return TableIcmp32[Index].Mapping; @@ -2109,12 +2109,61 @@ return; } case Intrinsics::Bswap: - case Intrinsics::Ctlz: - case Intrinsics::Ctpop: - case Intrinsics::Cttz: - // TODO(jvoung): fill it in. Func->setError("Unhandled intrinsic"); return; + case Intrinsics::Ctpop: { + Variable *Dest = Instr->getDest(); + Operand *Val = Instr->getArg(0); + InstCall *Call = makeHelperCall(Val->getType() == IceType_i64 ? + "__popcountdi2" : "__popcountsi2", Dest, 1); + Call->addArg(Val); + lowerCall(Call); + // The popcount helpers always return 32-bit values, while the intrinsic's + // signature matches the native POPCNT instruction and fills a 64-bit reg + // (in 64-bit mode). Thus, clear the upper bits of the dest just in case + // the user doesn't do that in the IR. If the user does that in the IR, + // then this zero'ing instruction is dead and gets optimized out. 
+ if (Val->getType() == IceType_i64) { + Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); + Constant *Zero = Ctx->getConstantZero(IceType_i32); + _mov(DestHi, Zero); + } + return; + } + case Intrinsics::Ctlz: { + // The "is zero undef" parameter is ignored and we always return + // a well-defined value. + Operand *Val = legalize(Instr->getArg(0)); + Operand *FirstVal; + Operand *SecondVal = NULL; + if (Val->getType() == IceType_i64) { + FirstVal = loOperand(Val); + SecondVal = hiOperand(Val); + } else { + FirstVal = Val; + } + const bool IsCttz = false; + lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, + SecondVal); + return; + } + case Intrinsics::Cttz: { + // The "is zero undef" parameter is ignored and we always return + // a well-defined value. + Operand *Val = legalize(Instr->getArg(0)); + Operand *FirstVal; + Operand *SecondVal = NULL; + if (Val->getType() == IceType_i64) { + FirstVal = hiOperand(Val); + SecondVal = loOperand(Val); + } else { + FirstVal = Val; + } + const bool IsCttz = true; + lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, + SecondVal); + return; + } case Intrinsics::Longjmp: { InstCall *Call = makeHelperCall("longjmp", NULL, 2); Call->addArg(Instr->getArg(0)); @@ -2408,6 +2457,81 @@ _mov(Dest, T_eax); } +// Lowers count {trailing, leading} zeros intrinsic. +// +// We could do constant folding here, but that should have +// been done by the front-end/middle-end optimizations. +void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, + Operand *FirstVal, Operand *SecondVal) { + // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI). + // Then the instructions will handle the Val == 0 case much more simply + // and won't require conversion from bit position to number of zeros. 
+ // + // Otherwise: + // bsr IF_NOT_ZERO, Val + // mov T_DEST, 63 + // cmovne T_DEST, IF_NOT_ZERO + // xor T_DEST, 31 + // mov DEST, T_DEST + // + // NOTE: T_DEST must be a register because cmov requires its dest to be a + // register. Also, bsf and bsr require their dest to be a register. + // + // The xor DEST, 31 converts a bit position to # of leading zeroes. + // E.g., for 000... 00001100, bsr will say that the most significant bit + // set is at position 3, while the number of leading zeros is 28. Xor is + // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case). + // + // Similar for 64-bit, but start w/ speculating that the upper 32 bits + // are all zero, and compute the result for that case (checking the lower + // 32 bits). Then actually compute the result for the upper bits and + // cmov in the result from the lower computation if the earlier speculation + // was correct. + // + // Cttz is similar, but uses bsf instead, and doesn't require the xor + // bit position conversion, and the speculation is reversed. + assert(Ty == IceType_i32 || Ty == IceType_i64); + Variable *T = makeReg(IceType_i32); + if (Cttz) { + _bsf(T, FirstVal); + } else { + _bsr(T, FirstVal); + } + Variable *T_Dest = makeReg(IceType_i32); + Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32); + Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31); + if (Cttz) { + _mov(T_Dest, ThirtyTwo); + } else { + Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63); + _mov(T_Dest, SixtyThree); + } + _cmov(T_Dest, T, InstX8632::Br_ne); + if (!Cttz) { + _xor(T_Dest, ThirtyOne); + } + if (Ty == IceType_i32) { + _mov(Dest, T_Dest); + return; + } + _add(T_Dest, ThirtyTwo); + Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); + Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); + // Will be using "test" on this, so we need a registerized variable. 
+ Variable *SecondVar = legalizeToVar(SecondVal); + Variable *T_Dest2 = makeReg(IceType_i32); + if (Cttz) { + _bsf(T_Dest2, SecondVar); + } else { + _bsr(T_Dest2, SecondVar); + _xor(T_Dest2, ThirtyOne); + } + _test(SecondVar, SecondVar); + _cmov(T_Dest2, T_Dest, InstX8632::Br_e); + _mov(DestLo, T_Dest2); + _mov(DestHi, Ctx->getConstantZero(IceType_i32)); +} + namespace { bool isAdd(const Inst *Inst) {