Subzero. Implements x86-64 lowerCall.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4077
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1266673003.
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 1fcf0b9..f5d4ead 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -21,6 +21,14 @@
 
 namespace Ice {
 
+//------------------------------------------------------------------------------
+//      ______   ______     ______     __     ______   ______
+//     /\__  _\ /\  == \   /\  __ \   /\ \   /\__  _\ /\  ___\
+//     \/_/\ \/ \ \  __<   \ \  __ \  \ \ \  \/_/\ \/ \ \___  \
+//        \ \_\  \ \_\ \_\  \ \_\ \_\  \ \_\    \ \_\  \/\_____\
+//         \/_/   \/_/ /_/   \/_/\/_/   \/_/     \/_/   \/_____/
+//
+//------------------------------------------------------------------------------
 namespace X86Internal {
 const MachineTraits<TargetX8664>::TableFcmpType
     MachineTraits<TargetX8664>::TableFcmp[] = {
@@ -81,6 +89,286 @@
 
 } // end of namespace X86Internal
 
+//------------------------------------------------------------------------------
+//     __      ______  __     __  ______  ______  __  __   __  ______
+//    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
+//    \ \ \___\ \ \/\ \ \ \/ ".\ \ \  __\\ \  __<\ \ \ \ \-.  \ \ \__ \
+//     \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
+//      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
+//
+//------------------------------------------------------------------------------
+namespace {
+static inline TargetX8664::Traits::RegisterSet::AllRegisters
+getRegisterForXmmArgNum(uint32_t ArgNum) {
+  assert(ArgNum < TargetX8664::Traits::X86_MAX_XMM_ARGS);
+  return static_cast<TargetX8664::Traits::RegisterSet::AllRegisters>(
+      TargetX8664::Traits::RegisterSet::Reg_xmm0 + ArgNum);
+}
+
+static inline TargetX8664::Traits::RegisterSet::AllRegisters
+getRegisterForGprArgNum(uint32_t ArgNum) {
+  assert(ArgNum < TargetX8664::Traits::X86_MAX_GPR_ARGS);
+  static const TargetX8664::Traits::RegisterSet::AllRegisters GprForArgNum[] = {
+      TargetX8664::Traits::RegisterSet::Reg_edi,
+      TargetX8664::Traits::RegisterSet::Reg_esi,
+      TargetX8664::Traits::RegisterSet::Reg_edx,
+      TargetX8664::Traits::RegisterSet::Reg_ecx,
+      TargetX8664::Traits::RegisterSet::Reg_r8d,
+      TargetX8664::Traits::RegisterSet::Reg_r9d,
+  };
+  static_assert(llvm::array_lengthof(GprForArgNum) ==
+                    TargetX8664::TargetX8664::Traits::X86_MAX_GPR_ARGS,
+                "Mismatch between MAX_GPR_ARGS and GprForArgNum.");
+  return GprForArgNum[ArgNum];
+}
+
+// constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
+// OperandList in lowerCall. std::max() was supposed to work, but it doesn't.
+constexpr SizeT constexprMax(SizeT S0, SizeT S1) { return S0 < S1 ? S1 : S0; }
+
+} // end of anonymous namespace
+
// Lowers a call instruction: classifies arguments into xmm, GPR, and stack
// classes, materializes argument moves/stores, emits the (possibly
// sandboxed) call, and copies the return value into Instr's Dest.
// NOTE(review): 32-bit register names (esp/eax/edx) appear throughout; the
// TODO(jpp) comments below indicate the full 64-bit port is incomplete.
void TargetX8664::lowerCall(const InstCall *Instr) {
  // x86-64 calling convention:
  //
  // * At the point before the call, the stack must be aligned to 16
  // bytes.
  //
  // * The first eight arguments of vector/fp type, regardless of their
  // position relative to the other arguments in the argument list, are
  // placed in registers %xmm0 - %xmm7.
  //
  // * The first six arguments of integer types, regardless of their
  // position relative to the other arguments in the argument list, are
  // placed in registers %rdi, %rsi, %rdx, %rcx, %r8, and %r9.
  //
  // * Other arguments are pushed onto the stack in right-to-left order,
  // such that the left-most argument ends up on the top of the stack at
  // the lowest memory address.
  //
  // * Stack arguments of vector type are aligned to start at the next
  // highest multiple of 16 bytes.  Other stack arguments are aligned to
  // 8 bytes.
  //
  // This intends to match the section "Function Calling Sequence" of the
  // document "System V Application Binary Interface."
  NeedsStackAlignment = true;

  // Inline capacity covers the larger of the two per-class register-argument
  // limits, so classification never allocates for in-register arguments.
  using OperandList =
      llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
                                                Traits::X86_MAX_GPR_ARGS)>;
  OperandList XmmArgs;
  OperandList GprArgs;
  OperandList StackArgs, StackArgLocations;
  uint32_t ParameterAreaSizeBytes = 0;

  // Classify each argument operand according to the location where the
  // argument is passed.
  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
    Operand *Arg = Instr->getArg(i);
    Type Ty = Arg->getType();
    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
    assert(typeWidthInBytes(Ty) >= 4);
    if (isVectorType(Ty) && XmmArgs.size() < Traits::X86_MAX_XMM_ARGS) {
      XmmArgs.push_back(Arg);
    } else if (isScalarFloatingType(Ty) &&
               XmmArgs.size() < Traits::X86_MAX_XMM_ARGS) {
      XmmArgs.push_back(Arg);
    } else if (isScalarIntegerType(Ty) &&
               GprArgs.size() < Traits::X86_MAX_GPR_ARGS) {
      GprArgs.push_back(Arg);
    } else {
      StackArgs.push_back(Arg);
      // Vector arguments start at the next 16-byte boundary; the alignment
      // is applied before recording this argument's displacement.
      if (isVectorType(Arg->getType())) {
        ParameterAreaSizeBytes =
            Traits::applyStackAlignment(ParameterAreaSizeBytes);
      }
      // Record where the argument will be stored. The displacement is
      // relative to esp after the _adjust_stack() below, since the stores
      // are emitted after that adjustment.
      Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp);
      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
      StackArgLocations.push_back(
          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
    }
  }

  // Adjust the parameter area so that the stack is aligned.  It is
  // assumed that the stack is already aligned at the start of the
  // calling sequence.
  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);

  // Subtract the appropriate amount for the argument area.  This also
  // takes care of setting the stack adjustment during emission.
  //
  // TODO: If for some reason the call instruction gets dead-code
  // eliminated after lowering, we would need to ensure that the
  // pre-call and the post-call esp adjustment get eliminated as well.
  if (ParameterAreaSizeBytes) {
    _adjust_stack(ParameterAreaSizeBytes);
  }

  // Copy arguments that are passed on the stack to the appropriate
  // stack locations.
  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
  }

  // Copy arguments to be passed in registers to the appropriate
  // registers.
  // TODO: Investigate the impact of lowering arguments passed in
  // registers after lowering stack arguments as opposed to the other
  // way around.  Lowering register arguments after stack arguments may
  // reduce register pressure.  On the other hand, lowering register
  // arguments first (before stack arguments) may result in more compact
  // code, as the memory operand displacements may end up being smaller
  // before any stack adjustment is done.
  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
    Variable *Reg = legalizeToReg(XmmArgs[i], getRegisterForXmmArgNum(i));
    // Generate a FakeUse of register arguments so that they do not get
    // dead code eliminated as a result of the FakeKill of scratch
    // registers after the call.
    Context.insert(InstFakeUse::create(Func, Reg));
  }

  for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
    Variable *Reg = legalizeToReg(GprArgs[i], getRegisterForGprArgNum(i));
    Context.insert(InstFakeUse::create(Func, Reg));
  }

  // Generate the call instruction.  Assign its result to a temporary
  // with high register allocation weight.
  Variable *Dest = Instr->getDest();
  // ReturnReg doubles as ReturnRegLo as necessary.
  Variable *ReturnReg = nullptr;
  Variable *ReturnRegHi = nullptr;
  if (Dest) {
    switch (Dest->getType()) {
    case IceType_NUM:
      llvm_unreachable("Invalid Call dest type");
      break;
    case IceType_void:
      break;
    case IceType_i1:
    case IceType_i8:
    case IceType_i16:
    case IceType_i32:
      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
      break;
    case IceType_i64:
      // i64 is currently returned split across eax/edx, x86-32 style.
      // TODO(jpp): return i64 in a GPR.
      ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
      ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
      break;
    case IceType_f32:
    case IceType_f64:
    case IceType_v4i1:
    case IceType_v8i1:
    case IceType_v16i1:
    case IceType_v16i8:
    case IceType_v8i16:
    case IceType_v4i32:
    case IceType_v4f32:
      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_xmm0);
      break;
    }
  }

  // Sandboxed (NaCl) calls are bundle-locked so the call ends exactly at a
  // bundle boundary; an indirect target is additionally masked down to a
  // bundle start before the jump.
  Operand *CallTarget = legalize(Instr->getCallTarget());
  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
  if (NeedSandboxing) {
    if (llvm::isa<Constant>(CallTarget)) {
      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
    } else {
      Variable *CallTargetVar = nullptr;
      _mov(CallTargetVar, CallTarget);
      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
      const SizeT BundleSize =
          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
      // Clear the low bits so the target lands on a bundle boundary.
      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
      CallTarget = CallTargetVar;
    }
  }
  Inst *NewCall = Traits::Insts::Call::create(Func, ReturnReg, CallTarget);
  Context.insert(NewCall);
  if (NeedSandboxing)
    _bundle_unlock();
  // The high half of an i64 return is defined implicitly by the call; the
  // FakeDef keeps edx live for the copy into DestHi below.
  if (ReturnRegHi)
    Context.insert(InstFakeDef::create(Func, ReturnRegHi));

  // Add the appropriate offset to esp.  The call instruction takes care
  // of resetting the stack offset during emission.
  if (ParameterAreaSizeBytes) {
    Variable *Esp =
        Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
    _add(Esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
  }

  // Insert a register-kill pseudo instruction.
  Context.insert(InstFakeKill::create(Func, NewCall));

  // Generate a FakeUse to keep the call live if necessary.
  if (Instr->hasSideEffects() && ReturnReg) {
    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
    Context.insert(FakeUse);
  }

  if (!Dest)
    return;

  assert(ReturnReg && "x86-64 always returns value on registers.");

  // Assign the result of the call to Dest.
  if (ReturnRegHi) {
    // i64 result: recombine the eax/edx halves into Dest's lo/hi pair.
    assert(Dest->getType() == IceType_i64);
    split64(Dest);
    Variable *DestLo = Dest->getLo();
    Variable *DestHi = Dest->getHi();
    _mov(DestLo, ReturnReg);
    _mov(DestHi, ReturnRegHi);
    return;
  }

  assert(Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64 ||
         Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
         Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
         isVectorType(Dest->getType()));

  // Vector/fp results come back in xmm0 and need a packed move; scalar
  // integer results use a plain mov.
  if (isScalarFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
    _movp(Dest, ReturnReg);
  } else {
    _mov(Dest, ReturnReg);
  }
}
+
+void TargetDataX8664::lowerJumpTables() {
+  switch (Ctx->getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    for (const JumpTableData &JumpTable : Ctx->getJumpTables())
+      // TODO(jpp): not 386.
+      Writer->writeJumpTable(JumpTable, llvm::ELF::R_386_32);
+  } break;
+  case FT_Asm:
+    // Already emitted from Cfg
+    break;
+  case FT_Iasm: {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    for (const JumpTableData &JT : Ctx->getJumpTables()) {
+      Str << "\t.section\t.rodata." << JT.getFunctionName()
+          << "$jumptable,\"a\",@progbits\n";
+      Str << "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n";
+      Str << InstJumpTable::makeName(JT.getFunctionName(), JT.getId()) << ":";
+
+      // On X8664 ILP32 pointers are 32-bit hence the use of .long
+      for (intptr_t TargetOffset : JT.getTargetOffsets())
+        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
+      Str << "\n";
+    }
+  } break;
+  }
+}
+
 namespace {
 template <typename T> struct PoolTypeConverter {};
 
@@ -236,36 +524,6 @@
   Str << "\n";
 }
 
-void TargetDataX8664::lowerJumpTables() {
-  switch (Ctx->getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    for (const JumpTableData &JT : Ctx->getJumpTables())
-      // TODO(jpp): not 386.
-      Writer->writeJumpTable(JT, llvm::ELF::R_386_32);
-  } break;
-  case FT_Asm:
-    // Already emitted from Cfg
-    break;
-  case FT_Iasm: {
-    if (!BuildDefs::dump())
-      return;
-    Ostream &Str = Ctx->getStrEmit();
-    for (const JumpTableData &JT : Ctx->getJumpTables()) {
-      Str << "\t.section\t.rodata." << JT.getFunctionName()
-          << "$jumptable,\"a\",@progbits\n";
-      Str << "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n";
-      Str << InstJumpTable::makeName(JT.getFunctionName(), JT.getId()) << ":";
-
-      // On X8664 ILP32 pointers are 32-bit hence the use of .long
-      for (intptr_t TargetOffset : JT.getTargetOffsets())
-        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
-      Str << "\n";
-    }
-  } break;
-  }
-}
-
 void TargetDataX8664::lowerGlobals(const VariableDeclarationList &Vars,
                                    const IceString &SectionSuffix) {
   switch (Ctx->getFlags().getOutFileType()) {