Subzero ARM32: Lower shift and zext, sext, and trunc.
Sext, etc. usually use shifts (especially for i1 and i64),
so implement shift first, then implement those casts.
Implement just enough of bitcast to handle accessing
global addresses (used by some tests). Most other
bitcasts are from GPR to FP, and FP regs
aren't modeled yet.
Generally following the GCC style for 64-bit shifts.
This takes advantage of the flexible second operand in an "orr",
and takes advantage of the shift-beyond-bitwidth saturation.
LLVM is almost the same, but only seems to take advantage of this
on one side of the 32-bit boundary, not the other. Should really
get some of the execution tests running to test this behavior!
Fix InstARM32Str::dump(). Str doesn't have a Dest, so use Src.
BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/1143323013
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 37f5e19..8f7c331 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -1069,9 +1069,90 @@
_mov(DestLo, T_Lo);
_mov(DestHi, T_Hi);
} break;
- case InstArithmetic::Shl:
+ case InstArithmetic::Shl: {
+ // a=b<<c ==>
+ // GCC 4.8 does:
+ // sub t_c1, c.lo, #32
+ // lsl t_hi, b.hi, c.lo
+ // orr t_hi, t_hi, b.lo, lsl t_c1
+ // rsb t_c2, c.lo, #32
+ // orr t_hi, t_hi, b.lo, lsr t_c2
+ // lsl t_lo, b.lo, c.lo
+ // a.lo = t_lo
+ // a.hi = t_hi
+ // Can be strength-reduced for constant-shifts, but we don't do
+ // that for now.
+ // Given the sub/rsb T_C, C.lo, #32, one of the T_C will be negative.
+ // On ARM, shifts only take the lower 8 bits of the shift register,
+ // and saturate to the range 0-32, so the negative value will
+ // saturate to 32.
+ Variable *T_Hi = makeReg(IceType_i32);
+ Variable *Src1RLo = legalizeToVar(Src1Lo);
+ Constant *ThirtyTwo = Ctx->getConstantInt32(32);
+ Variable *T_C1 = makeReg(IceType_i32);
+ Variable *T_C2 = makeReg(IceType_i32);
+ _sub(T_C1, Src1RLo, ThirtyTwo);
+ _lsl(T_Hi, Src0RHi, Src1RLo);
+ _orr(T_Hi, T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, Src0RLo,
+ OperandARM32::LSL, T_C1));
+ _rsb(T_C2, Src1RLo, ThirtyTwo);
+ _orr(T_Hi, T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, Src0RLo,
+ OperandARM32::LSR, T_C2));
+ _mov(DestHi, T_Hi);
+ Variable *T_Lo = makeReg(IceType_i32);
+ // _mov seems to sometimes have better register preferencing than lsl.
+ // Otherwise mov w/ lsl shifted register is a pseudo-instruction
+ // that maps to lsl.
+ _mov(T_Lo, OperandARM32FlexReg::create(Func, IceType_i32, Src0RLo,
+ OperandARM32::LSL, Src1RLo));
+ _mov(DestLo, T_Lo);
+ } break;
case InstArithmetic::Lshr:
- case InstArithmetic::Ashr:
+ // a=b>>c (unsigned) ==>
+ // GCC 4.8 does:
+ // rsb t_c1, c.lo, #32
+ // lsr t_lo, b.lo, c.lo
+ // orr t_lo, t_lo, b.hi, lsl t_c1
+ // sub t_c2, c.lo, #32
+ // orr t_lo, t_lo, b.hi, lsr t_c2
+ // lsr t_hi, b.hi, c.lo
+ // a.lo = t_lo
+ // a.hi = t_hi
+ case InstArithmetic::Ashr: {
+ // a=b>>c (signed) ==> ...
+ // Ashr is similar, but the sub t_c2, c.lo, #32 should set flags,
+ // and the next orr should be conditioned on PLUS. The last two
+ // right shifts should also be arithmetic.
+ bool IsAshr = Inst->getOp() == InstArithmetic::Ashr;
+ Variable *T_Lo = makeReg(IceType_i32);
+ Variable *Src1RLo = legalizeToVar(Src1Lo);
+ Constant *ThirtyTwo = Ctx->getConstantInt32(32);
+ Variable *T_C1 = makeReg(IceType_i32);
+ Variable *T_C2 = makeReg(IceType_i32);
+ _rsb(T_C1, Src1RLo, ThirtyTwo);
+ _lsr(T_Lo, Src0RLo, Src1RLo);
+ _orr(T_Lo, T_Lo, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
+ OperandARM32::LSL, T_C1));
+ OperandARM32::ShiftKind RShiftKind;
+ CondARM32::Cond Pred;
+ if (IsAshr) {
+ _subs(T_C2, Src1RLo, ThirtyTwo);
+ RShiftKind = OperandARM32::ASR;
+ Pred = CondARM32::PL;
+ } else {
+ _sub(T_C2, Src1RLo, ThirtyTwo);
+ RShiftKind = OperandARM32::LSR;
+ Pred = CondARM32::AL;
+ }
+ _orr(T_Lo, T_Lo, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
+ RShiftKind, T_C2),
+ Pred);
+ _mov(DestLo, T_Lo);
+ Variable *T_Hi = makeReg(IceType_i32);
+ _mov(T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
+ RShiftKind, Src1RLo));
+ _mov(DestHi, T_Hi);
+ } break;
case InstArithmetic::Udiv:
case InstArithmetic::Sdiv:
case InstArithmetic::Urem:
@@ -1122,13 +1203,16 @@
_mov(Dest, T);
} break;
case InstArithmetic::Shl:
- UnimplementedError(Func->getContext()->getFlags());
+ _lsl(T, Src0R, Src1);
+ _mov(Dest, T);
break;
case InstArithmetic::Lshr:
- UnimplementedError(Func->getContext()->getFlags());
+ _lsr(T, Src0R, Src1);
+ _mov(Dest, T);
break;
case InstArithmetic::Ashr:
- UnimplementedError(Func->getContext()->getFlags());
+ _asr(T, Src0R, Src1);
+ _mov(Dest, T);
break;
case InstArithmetic::Udiv:
UnimplementedError(Func->getContext()->getFlags());
@@ -1311,20 +1395,123 @@
void TargetARM32::lowerCast(const InstCast *Inst) {
InstCast::OpKind CastKind = Inst->getCastKind();
+ Variable *Dest = Inst->getDest();
+ Operand *Src0 = Inst->getSrc(0);
switch (CastKind) {
default:
Func->setError("Cast type not supported");
return;
case InstCast::Sext: {
- UnimplementedError(Func->getContext()->getFlags());
+ if (isVectorType(Dest->getType())) {
+ UnimplementedError(Func->getContext()->getFlags());
+ } else if (Dest->getType() == IceType_i64) {
+ // t1=sxtb src; t2= mov t1 asr #31; dst.lo=t1; dst.hi=t2
+ Constant *ShiftAmt = Ctx->getConstantInt32(31);
+ Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+ Variable *T_Lo = makeReg(DestLo->getType());
+ if (Src0->getType() == IceType_i32) {
+ Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
+ _mov(T_Lo, Src0RF);
+ } else if (Src0->getType() == IceType_i1) {
+ Variable *Src0R = legalizeToVar(Src0);
+ _lsl(T_Lo, Src0R, ShiftAmt);
+ _asr(T_Lo, T_Lo, ShiftAmt);
+ } else {
+ Variable *Src0R = legalizeToVar(Src0);
+ _sxt(T_Lo, Src0R);
+ }
+ _mov(DestLo, T_Lo);
+ Variable *T_Hi = makeReg(DestHi->getType());
+ if (Src0->getType() != IceType_i1) {
+ _mov(T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, T_Lo,
+ OperandARM32::ASR, ShiftAmt));
+ } else {
+ // For i1, the asr instruction is already done above.
+ _mov(T_Hi, T_Lo);
+ }
+ _mov(DestHi, T_Hi);
+ } else if (Src0->getType() == IceType_i1) {
+ // GPR registers are 32-bit, so just use 31 as dst_bitwidth - 1.
+ // lsl t1, src_reg, 31
+ // asr t1, t1, 31
+ // dst = t1
+ Variable *Src0R = legalizeToVar(Src0);
+ Constant *ShiftAmt = Ctx->getConstantInt32(31);
+ Variable *T = makeReg(Dest->getType());
+ _lsl(T, Src0R, ShiftAmt);
+ _asr(T, T, ShiftAmt);
+ _mov(Dest, T);
+ } else {
+ // t1 = sxt src; dst = t1
+ Variable *Src0R = legalizeToVar(Src0);
+ Variable *T = makeReg(Dest->getType());
+ _sxt(T, Src0R);
+ _mov(Dest, T);
+ }
break;
}
case InstCast::Zext: {
- UnimplementedError(Func->getContext()->getFlags());
+ if (isVectorType(Dest->getType())) {
+ UnimplementedError(Func->getContext()->getFlags());
+ } else if (Dest->getType() == IceType_i64) {
+ // t1=uxtb src; dst.lo=t1; dst.hi=0
+ Constant *Zero = Ctx->getConstantZero(IceType_i32);
+ Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+ Variable *T_Lo = makeReg(DestLo->getType());
+ // i32 and i1 can just take up the whole register.
+ // i32 doesn't need uxt, while i1 will have an and mask later anyway.
+ if (Src0->getType() == IceType_i32 || Src0->getType() == IceType_i1) {
+ Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
+ _mov(T_Lo, Src0RF);
+ } else {
+ Variable *Src0R = legalizeToVar(Src0);
+ _uxt(T_Lo, Src0R);
+ }
+ if (Src0->getType() == IceType_i1) {
+ Constant *One = Ctx->getConstantInt32(1);
+ _and(T_Lo, T_Lo, One);
+ }
+ _mov(DestLo, T_Lo);
+ Variable *T_Hi = makeReg(DestLo->getType());
+ _mov(T_Hi, Zero);
+ _mov(DestHi, T_Hi);
+ } else if (Src0->getType() == IceType_i1) {
+ // t = Src0; t &= 1; Dest = t
+ Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
+ Constant *One = Ctx->getConstantInt32(1);
+ Variable *T = makeReg(Dest->getType());
+ // Just use _mov instead of _uxt since all registers are 32-bit.
+ // _uxt requires the source to be a register so could have required
+ // a _mov from legalize anyway.
+ _mov(T, Src0RF);
+ _and(T, T, One);
+ _mov(Dest, T);
+ } else {
+ // t1 = uxt src; dst = t1
+ Variable *Src0R = legalizeToVar(Src0);
+ Variable *T = makeReg(Dest->getType());
+ _uxt(T, Src0R);
+ _mov(Dest, T);
+ }
break;
}
case InstCast::Trunc: {
- UnimplementedError(Func->getContext()->getFlags());
+ if (isVectorType(Dest->getType())) {
+ UnimplementedError(Func->getContext()->getFlags());
+ } else {
+ Operand *Src0 = Inst->getSrc(0);
+ if (Src0->getType() == IceType_i64)
+ Src0 = loOperand(Src0);
+ Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
+ // t1 = trunc Src0RF; Dest = t1
+ Variable *T = makeReg(Dest->getType());
+ _mov(T, Src0RF);
+ if (Dest->getType() == IceType_i1)
+ _and(T, T, Ctx->getConstantInt1(1));
+ _mov(Dest, T);
+ }
break;
}
case InstCast::Fptrunc:
@@ -1348,6 +1535,12 @@
break;
}
case InstCast::Bitcast: {
+ Operand *Src0 = Inst->getSrc(0);
+ if (Dest->getType() == Src0->getType()) {
+ InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
+ lowerAssign(Assign);
+ return;
+ }
UnimplementedError(Func->getContext()->getFlags());
break;
}
@@ -1469,20 +1662,20 @@
//
// We'll go with the LLVM way for now, since it's shorter and has just as
// few dependencies.
- int32_t ShiftAmount = 32 - getScalarIntBitWidth(Src0->getType());
- assert(ShiftAmount >= 0);
+ int32_t ShiftAmt = 32 - getScalarIntBitWidth(Src0->getType());
+ assert(ShiftAmt >= 0);
Constant *ShiftConst = nullptr;
Variable *Src0R = nullptr;
Variable *T = makeReg(IceType_i32);
- if (ShiftAmount) {
- ShiftConst = Ctx->getConstantInt32(ShiftAmount);
+ if (ShiftAmt) {
+ ShiftConst = Ctx->getConstantInt32(ShiftAmt);
Src0R = makeReg(IceType_i32);
_lsl(Src0R, legalizeToVar(Src0), ShiftConst);
} else {
Src0R = legalizeToVar(Src0);
}
_mov(T, Zero);
- if (ShiftAmount) {
+ if (ShiftAmt) {
Variable *Src1R = legalizeToVar(Src1);
OperandARM32FlexReg *Src1RShifted = OperandARM32FlexReg::create(
Func, IceType_i32, Src1R, OperandARM32::LSL, ShiftConst);