Subzero. Implements x86-64 lowerCall.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4077
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1266673003.
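
Note (illustration, not part of the patch): the new x86-64 lowerCall classifies
call arguments as described in its comments -- the first eight vector/FP
arguments go in %xmm0-%xmm7, the first six integer arguments go in %rdi, %rsi,
%rdx, %rcx, %r8, %r9, and the rest go on the stack, with vector stack slots
aligned to 16 bytes. The standalone sketch below shows only that classification;
the types and helpers are hypothetical stand-ins (the real code works on
Ice::Operand and the Traits constants X86_MAX_XMM_ARGS / X86_MAX_GPR_ARGS), and
scalar stack slots are assumed to be 8 bytes as the comments intend, even though
typeWidthInBytesOnStack still rounds to 4 (see the TODO in
IceTargetLoweringX86Base.h).

    // Illustrative sketch of the x86-64 argument classification.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    enum class ArgKind { Vector, Float, Integer };

    struct ArgSlot {
      enum Kind { Xmm, Gpr, Stack } Where;
      uint32_t Index; // register number, or byte offset from the stack pointer
    };

    static uint32_t alignTo16(uint32_t N) { return (N + 15) & ~uint32_t(15); }

    static std::vector<ArgSlot> classify(const std::vector<ArgKind> &Args) {
      const uint32_t MaxXmmArgs = 8, MaxGprArgs = 6; // the Traits constants
      std::vector<ArgSlot> Slots;
      uint32_t NumXmm = 0, NumGpr = 0, StackBytes = 0;
      for (ArgKind A : Args) {
        if ((A == ArgKind::Vector || A == ArgKind::Float) && NumXmm < MaxXmmArgs) {
          Slots.push_back({ArgSlot::Xmm, NumXmm++});
        } else if (A == ArgKind::Integer && NumGpr < MaxGprArgs) {
          Slots.push_back({ArgSlot::Gpr, NumGpr++});
        } else {
          if (A == ArgKind::Vector) // vector stack slots start 16-byte aligned
            StackBytes = alignTo16(StackBytes);
          Slots.push_back({ArgSlot::Stack, StackBytes});
          StackBytes += (A == ArgKind::Vector) ? 16 : 8; // assumed 8-byte scalars
        }
      }
      return Slots;
    }

    int main() {
      // One integer, one float, one vector argument: gpr #0, xmm #0, xmm #1.
      for (ArgSlot S : classify({ArgKind::Integer, ArgKind::Float, ArgKind::Vector}))
        std::cout << S.Where << " " << S.Index << "\n";
      return 0;
    }
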
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 1bcb857..0a426c7 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -881,20 +881,19 @@
 JumpTableDataList GlobalContext::getJumpTables() {
   JumpTableDataList JumpTables(*getJumpTableList());
   if (getFlags().shouldReorderPooledConstants()) {
-  // If reorder-pooled-constants option is set to true, we need to shuffle the
-  // constant pool before emitting it.
-    RandomShuffle(JumpTables.begin(), JumpTables.end(), [this](uint64_t N) {
-      return (uint32_t)getRNG().next(N);
-    });
+    // If the reorder-pooled-constants option is set to true, we need to
+    // shuffle the constant pool before emitting it.
+    RandomShuffle(JumpTables.begin(), JumpTables.end(),
+                  [this](uint64_t N) { return (uint32_t)getRNG().next(N); });
   } else {
     // Make order deterministic by sorting into functions and then ID of the
     // jump table within that function.
-    std::sort(JumpTables.begin(), JumpTables.end(), [](const JumpTableData &A,
-                                                       const JumpTableData &B) {
-      if (A.getFunctionName() != B.getFunctionName())
-        return A.getFunctionName() < B.getFunctionName();
-      return A.getId() < B.getId();
-    });
+    std::sort(JumpTables.begin(), JumpTables.end(),
+              [](const JumpTableData &A, const JumpTableData &B) {
+                if (A.getFunctionName() != B.getFunctionName())
+                  return A.getFunctionName() < B.getFunctionName();
+                return A.getId() < B.getId();
+              });
   }
   return JumpTables;
 }
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 44939dd..d765660 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -20,9 +20,6 @@
 ///
 /// ::Ice::X8632::Traits::Insts::Mov::create
 ///
-/// In the future, this file might be used to declare X8632 specific
-/// instructions (e.g., FLD, and FSTP.)
-///
 //===----------------------------------------------------------------------===//
 
 #ifndef SUBZERO_SRC_ICEINSTX8632_H
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index a42e122..8adfab2 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -21,6 +21,14 @@
 
 namespace Ice {
 
+//------------------------------------------------------------------------------
+//      ______   ______     ______     __     ______   ______
+//     /\__  _\ /\  == \   /\  __ \   /\ \   /\__  _\ /\  ___\
+//     \/_/\ \/ \ \  __<   \ \  __ \  \ \ \  \/_/\ \/ \ \___  \
+//        \ \_\  \ \_\ \_\  \ \_\ \_\  \ \_\    \ \_\  \/\_____\
+//         \/_/   \/_/ /_/   \/_/\/_/   \/_/     \/_/   \/_____/
+//
+//------------------------------------------------------------------------------
 namespace X86Internal {
 const MachineTraits<TargetX8632>::TableFcmpType
     MachineTraits<TargetX8632>::TableFcmp[] = {
@@ -399,4 +407,214 @@
 } // end of namespace dummy3
 } // end of anonymous namespace
 
+//------------------------------------------------------------------------------
+//     __      ______  __     __  ______  ______  __  __   __  ______
+//    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
+//    \ \ \___\ \ \/\ \ \ \/ ".\ \ \  __\\ \  __<\ \ \ \ \-.  \ \ \__ \
+//     \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
+//      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
+//
+//------------------------------------------------------------------------------
+void TargetX8632::lowerCall(const InstCall *Instr) {
+  // x86-32 calling convention:
+  //
+  // * At the point before the call, the stack must be aligned to 16
+  // bytes.
+  //
+  // * The first four arguments of vector type, regardless of their
+  // position relative to the other arguments in the argument list, are
+  // placed in registers xmm0 - xmm3.
+  //
+  // * Other arguments are pushed onto the stack in right-to-left order,
+  // such that the left-most argument ends up on the top of the stack at
+  // the lowest memory address.
+  //
+  // * Stack arguments of vector type are aligned to start at the next
+  // highest multiple of 16 bytes.  Other stack arguments are aligned to
+  // 4 bytes.
+  //
+  // This intends to match the section "IA-32 Function Calling
+  // Convention" of the document "OS X ABI Function Call Guide" by
+  // Apple.
+  NeedsStackAlignment = true;
+
+  typedef std::vector<Operand *> OperandList;
+  OperandList XmmArgs;
+  OperandList StackArgs, StackArgLocations;
+  uint32_t ParameterAreaSizeBytes = 0;
+
+  // Classify each argument operand according to the location where the
+  // argument is passed.
+  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    Type Ty = Arg->getType();
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(typeWidthInBytes(Ty) >= 4);
+    if (isVectorType(Ty) && XmmArgs.size() < Traits::X86_MAX_XMM_ARGS) {
+      XmmArgs.push_back(Arg);
+    } else {
+      StackArgs.push_back(Arg);
+      if (isVectorType(Arg->getType())) {
+        ParameterAreaSizeBytes =
+            Traits::applyStackAlignment(ParameterAreaSizeBytes);
+      }
+      Variable *esp =
+          Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
+      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
+      StackArgLocations.push_back(
+          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
+      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
+    }
+  }
+
+  // Adjust the parameter area so that the stack is aligned.  It is
+  // assumed that the stack is already aligned at the start of the
+  // calling sequence.
+  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
+
+  // Subtract the appropriate amount for the argument area.  This also
+  // takes care of setting the stack adjustment during emission.
+  //
+  // TODO: If for some reason the call instruction gets dead-code
+  // eliminated after lowering, we would need to ensure that the
+  // pre-call and the post-call esp adjustment get eliminated as well.
+  if (ParameterAreaSizeBytes) {
+    _adjust_stack(ParameterAreaSizeBytes);
+  }
+
+  // Copy arguments that are passed on the stack to the appropriate
+  // stack locations.
+  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
+    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
+  }
+
+  // Copy arguments to be passed in registers to the appropriate
+  // registers.
+  // TODO: Investigate the impact of lowering arguments passed in
+  // registers after lowering stack arguments as opposed to the other
+  // way around.  Lowering register arguments after stack arguments may
+  // reduce register pressure.  On the other hand, lowering register
+  // arguments first (before stack arguments) may result in more compact
+  // code, as the memory operand displacements may end up being smaller
+  // before any stack adjustment is done.
+  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
+    Variable *Reg =
+        legalizeToReg(XmmArgs[i], Traits::RegisterSet::Reg_xmm0 + i);
+    // Generate a FakeUse of register arguments so that they do not get
+    // dead code eliminated as a result of the FakeKill of scratch
+    // registers after the call.
+    Context.insert(InstFakeUse::create(Func, Reg));
+  }
+  // Generate the call instruction.  Assign its result to a temporary
+  // with high register allocation weight.
+  Variable *Dest = Instr->getDest();
+  // ReturnReg doubles as ReturnRegLo as necessary.
+  Variable *ReturnReg = nullptr;
+  Variable *ReturnRegHi = nullptr;
+  if (Dest) {
+    switch (Dest->getType()) {
+    case IceType_NUM:
+      llvm_unreachable("Invalid Call dest type");
+      break;
+    case IceType_void:
+      break;
+    case IceType_i1:
+    case IceType_i8:
+    case IceType_i16:
+    case IceType_i32:
+      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
+      break;
+    case IceType_i64:
+      ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+      ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      break;
+    case IceType_f32:
+    case IceType_f64:
+      // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
+      // the fstp instruction.
+      break;
+    case IceType_v4i1:
+    case IceType_v8i1:
+    case IceType_v16i1:
+    case IceType_v16i8:
+    case IceType_v8i16:
+    case IceType_v4i32:
+    case IceType_v4f32:
+      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_xmm0);
+      break;
+    }
+  }
+  Operand *CallTarget = legalize(Instr->getCallTarget());
+  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
+  if (NeedSandboxing) {
+    if (llvm::isa<Constant>(CallTarget)) {
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+    } else {
+      Variable *CallTargetVar = nullptr;
+      _mov(CallTargetVar, CallTarget);
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+      const SizeT BundleSize =
+          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
+      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
+      CallTarget = CallTargetVar;
+    }
+  }
+  Inst *NewCall = Traits::Insts::Call::create(Func, ReturnReg, CallTarget);
+  Context.insert(NewCall);
+  if (NeedSandboxing)
+    _bundle_unlock();
+  if (ReturnRegHi)
+    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
+
+  // Add the appropriate offset to esp.  The call instruction takes care
+  // of resetting the stack offset during emission.
+  if (ParameterAreaSizeBytes) {
+    Variable *esp =
+        Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
+    _add(esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
+  }
+
+  // Insert a register-kill pseudo instruction.
+  Context.insert(InstFakeKill::create(Func, NewCall));
+
+  // Generate a FakeUse to keep the call live if necessary.
+  if (Instr->hasSideEffects() && ReturnReg) {
+    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
+    Context.insert(FakeUse);
+  }
+
+  if (!Dest)
+    return;
+
+  // Assign the result of the call to Dest.
+  if (ReturnReg) {
+    if (ReturnRegHi) {
+      assert(Dest->getType() == IceType_i64);
+      split64(Dest);
+      Variable *DestLo = Dest->getLo();
+      Variable *DestHi = Dest->getHi();
+      _mov(DestLo, ReturnReg);
+      _mov(DestHi, ReturnRegHi);
+    } else {
+      assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
+             Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
+             isVectorType(Dest->getType()));
+      if (isVectorType(Dest->getType())) {
+        _movp(Dest, ReturnReg);
+      } else {
+        _mov(Dest, ReturnReg);
+      }
+    }
+  } else if (isScalarFloatingType(Dest->getType())) {
+    // Special treatment for an FP function which returns its result in
+    // st(0).
+    // If Dest ends up being a physical xmm register, the fstp emit code
+    // will route st(0) through a temporary stack slot.
+    _fstp(Dest);
+    // Create a fake use of Dest in case it actually isn't used,
+    // because st(0) still needs to be popped.
+    Context.insert(InstFakeUse::create(Func, Dest));
+  }
+}
+
 } // end of namespace Ice
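
Note (illustration, not part of the patch): the x86-32 return-value routing
implemented above can be summarized by the minimal sketch below. The enum and
helper names are hypothetical (the real code switches over IceType_* and drives
_mov/_movp/_fstp on Ice::Variable); it is only meant to show why the f32/f64
case leaves ReturnReg null and pops st(0) with fstp.

    #include <iostream>
    #include <string>

    enum class RetType { Void, I32OrNarrower, I64, F32, F64, Vector };

    // Where the x86-32 convention used by this lowering leaves a call result.
    static std::string returnLocation(RetType Ty) {
      switch (Ty) {
      case RetType::Void:          return "no result";
      case RetType::I32OrNarrower: return "eax";
      case RetType::I64:           return "edx:eax (Dest is split64'd into lo/hi)";
      case RetType::F32:
      case RetType::F64:           return "st(0), captured with fstp";
      case RetType::Vector:        return "xmm0";
      }
      return "unreachable";
    }

    int main() {
      std::cout << returnLocation(RetType::F64) << "\n"; // st(0), captured with fstp
      return 0;
    }
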
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index f49e673..d086135 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -41,6 +41,9 @@
 
   static TargetX8632 *create(Cfg *Func) { return new TargetX8632(Func); }
 
+protected:
+  void lowerCall(const InstCall *Instr) override;
+
 private:
   friend class ::Ice::X86Internal::TargetX86Base<TargetX8632>;
 
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 1fcf0b9..f5d4ead 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -21,6 +21,14 @@
 
 namespace Ice {
 
+//------------------------------------------------------------------------------
+//      ______   ______     ______     __     ______   ______
+//     /\__  _\ /\  == \   /\  __ \   /\ \   /\__  _\ /\  ___\
+//     \/_/\ \/ \ \  __<   \ \  __ \  \ \ \  \/_/\ \/ \ \___  \
+//        \ \_\  \ \_\ \_\  \ \_\ \_\  \ \_\    \ \_\  \/\_____\
+//         \/_/   \/_/ /_/   \/_/\/_/   \/_/     \/_/   \/_____/
+//
+//------------------------------------------------------------------------------
 namespace X86Internal {
 const MachineTraits<TargetX8664>::TableFcmpType
     MachineTraits<TargetX8664>::TableFcmp[] = {
@@ -81,6 +89,286 @@
 
 } // end of namespace X86Internal
 
+//------------------------------------------------------------------------------
+//     __      ______  __     __  ______  ______  __  __   __  ______
+//    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
+//    \ \ \___\ \ \/\ \ \ \/ ".\ \ \  __\\ \  __<\ \ \ \ \-.  \ \ \__ \
+//     \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
+//      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
+//
+//------------------------------------------------------------------------------
+namespace {
+static inline TargetX8664::Traits::RegisterSet::AllRegisters
+getRegisterForXmmArgNum(uint32_t ArgNum) {
+  assert(ArgNum < TargetX8664::Traits::X86_MAX_XMM_ARGS);
+  return static_cast<TargetX8664::Traits::RegisterSet::AllRegisters>(
+      TargetX8664::Traits::RegisterSet::Reg_xmm0 + ArgNum);
+}
+
+static inline TargetX8664::Traits::RegisterSet::AllRegisters
+getRegisterForGprArgNum(uint32_t ArgNum) {
+  assert(ArgNum < TargetX8664::Traits::X86_MAX_GPR_ARGS);
+  static const TargetX8664::Traits::RegisterSet::AllRegisters GprForArgNum[] = {
+      TargetX8664::Traits::RegisterSet::Reg_edi,
+      TargetX8664::Traits::RegisterSet::Reg_esi,
+      TargetX8664::Traits::RegisterSet::Reg_edx,
+      TargetX8664::Traits::RegisterSet::Reg_ecx,
+      TargetX8664::Traits::RegisterSet::Reg_r8d,
+      TargetX8664::Traits::RegisterSet::Reg_r9d,
+  };
+  static_assert(llvm::array_lengthof(GprForArgNum) ==
+                    TargetX8664::Traits::X86_MAX_GPR_ARGS,
+                "Mismatch between MAX_GPR_ARGS and GprForArgNum.");
+  return GprForArgNum[ArgNum];
+}
+
+// constexprMax returns a (constexpr) max(S0, S1); it is used to size the
+// OperandList in lowerCall. std::max() cannot be used here because it is
+// not constexpr until C++14.
+constexpr SizeT constexprMax(SizeT S0, SizeT S1) { return S0 < S1 ? S1 : S0; }
+
+} // end of anonymous namespace
+
+void TargetX8664::lowerCall(const InstCall *Instr) {
+  // x86-64 calling convention:
+  //
+  // * At the point before the call, the stack must be aligned to 16
+  // bytes.
+  //
+  // * The first eight arguments of vector/fp type, regardless of their
+  // position relative to the other arguments in the argument list, are
+  // placed in registers %xmm0 - %xmm7.
+  //
+  // * The first six arguments of integer types, regardless of their
+  // position relative to the other arguments in the argument list, are
+  // placed in registers %rdi, %rsi, %rdx, %rcx, %r8, and %r9.
+  //
+  // * Other arguments are pushed onto the stack in right-to-left order,
+  // such that the left-most argument ends up on the top of the stack at
+  // the lowest memory address.
+  //
+  // * Stack arguments of vector type are aligned to start at the next
+  // highest multiple of 16 bytes.  Other stack arguments are aligned to
+  // 8 bytes.
+  //
+  // This intends to match the section "Function Calling Sequence" of the
+  // document "System V Application Binary Interface."
+  NeedsStackAlignment = true;
+
+  using OperandList =
+      llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
+                                                Traits::X86_MAX_GPR_ARGS)>;
+  OperandList XmmArgs;
+  OperandList GprArgs;
+  OperandList StackArgs, StackArgLocations;
+  uint32_t ParameterAreaSizeBytes = 0;
+
+  // Classify each argument operand according to the location where the
+  // argument is passed.
+  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    Type Ty = Arg->getType();
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(typeWidthInBytes(Ty) >= 4);
+    if (isVectorType(Ty) && XmmArgs.size() < Traits::X86_MAX_XMM_ARGS) {
+      XmmArgs.push_back(Arg);
+    } else if (isScalarFloatingType(Ty) &&
+               XmmArgs.size() < Traits::X86_MAX_XMM_ARGS) {
+      XmmArgs.push_back(Arg);
+    } else if (isScalarIntegerType(Ty) &&
+               GprArgs.size() < Traits::X86_MAX_GPR_ARGS) {
+      GprArgs.push_back(Arg);
+    } else {
+      StackArgs.push_back(Arg);
+      if (isVectorType(Arg->getType())) {
+        ParameterAreaSizeBytes =
+            Traits::applyStackAlignment(ParameterAreaSizeBytes);
+      }
+      Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp);
+      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
+      StackArgLocations.push_back(
+          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
+      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
+    }
+  }
+
+  // Adjust the parameter area so that the stack is aligned.  It is
+  // assumed that the stack is already aligned at the start of the
+  // calling sequence.
+  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
+
+  // Subtract the appropriate amount for the argument area.  This also
+  // takes care of setting the stack adjustment during emission.
+  //
+  // TODO: If for some reason the call instruction gets dead-code
+  // eliminated after lowering, we would need to ensure that the
+  // pre-call and the post-call esp adjustment get eliminated as well.
+  if (ParameterAreaSizeBytes) {
+    _adjust_stack(ParameterAreaSizeBytes);
+  }
+
+  // Copy arguments that are passed on the stack to the appropriate
+  // stack locations.
+  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
+    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
+  }
+
+  // Copy arguments to be passed in registers to the appropriate
+  // registers.
+  // TODO: Investigate the impact of lowering arguments passed in
+  // registers after lowering stack arguments as opposed to the other
+  // way around.  Lowering register arguments after stack arguments may
+  // reduce register pressure.  On the other hand, lowering register
+  // arguments first (before stack arguments) may result in more compact
+  // code, as the memory operand displacements may end up being smaller
+  // before any stack adjustment is done.
+  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
+    Variable *Reg = legalizeToReg(XmmArgs[i], getRegisterForXmmArgNum(i));
+    // Generate a FakeUse of register arguments so that they do not get
+    // dead code eliminated as a result of the FakeKill of scratch
+    // registers after the call.
+    Context.insert(InstFakeUse::create(Func, Reg));
+  }
+
+  for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
+    Variable *Reg = legalizeToReg(GprArgs[i], getRegisterForGprArgNum(i));
+    Context.insert(InstFakeUse::create(Func, Reg));
+  }
+
+  // Generate the call instruction.  Assign its result to a temporary
+  // with high register allocation weight.
+  Variable *Dest = Instr->getDest();
+  // ReturnReg doubles as ReturnRegLo as necessary.
+  Variable *ReturnReg = nullptr;
+  Variable *ReturnRegHi = nullptr;
+  if (Dest) {
+    switch (Dest->getType()) {
+    case IceType_NUM:
+      llvm_unreachable("Invalid Call dest type");
+      break;
+    case IceType_void:
+      break;
+    case IceType_i1:
+    case IceType_i8:
+    case IceType_i16:
+    case IceType_i32:
+      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
+      break;
+    case IceType_i64:
+      // TODO(jpp): return i64 in a GPR.
+      ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+      ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      break;
+    case IceType_f32:
+    case IceType_f64:
+    case IceType_v4i1:
+    case IceType_v8i1:
+    case IceType_v16i1:
+    case IceType_v16i8:
+    case IceType_v8i16:
+    case IceType_v4i32:
+    case IceType_v4f32:
+      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_xmm0);
+      break;
+    }
+  }
+
+  Operand *CallTarget = legalize(Instr->getCallTarget());
+  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
+  if (NeedSandboxing) {
+    if (llvm::isa<Constant>(CallTarget)) {
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+    } else {
+      Variable *CallTargetVar = nullptr;
+      _mov(CallTargetVar, CallTarget);
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+      const SizeT BundleSize =
+          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
+      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
+      CallTarget = CallTargetVar;
+    }
+  }
+  Inst *NewCall = Traits::Insts::Call::create(Func, ReturnReg, CallTarget);
+  Context.insert(NewCall);
+  if (NeedSandboxing)
+    _bundle_unlock();
+  if (ReturnRegHi)
+    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
+
+  // Add the appropriate offset to esp.  The call instruction takes care
+  // of resetting the stack offset during emission.
+  if (ParameterAreaSizeBytes) {
+    Variable *Esp =
+        Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
+    _add(Esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
+  }
+
+  // Insert a register-kill pseudo instruction.
+  Context.insert(InstFakeKill::create(Func, NewCall));
+
+  // Generate a FakeUse to keep the call live if necessary.
+  if (Instr->hasSideEffects() && ReturnReg) {
+    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
+    Context.insert(FakeUse);
+  }
+
+  if (!Dest)
+    return;
+
+  assert(ReturnReg && "x86-64 always returns values in registers.");
+
+  // Assign the result of the call to Dest.
+  if (ReturnRegHi) {
+    assert(Dest->getType() == IceType_i64);
+    split64(Dest);
+    Variable *DestLo = Dest->getLo();
+    Variable *DestHi = Dest->getHi();
+    _mov(DestLo, ReturnReg);
+    _mov(DestHi, ReturnRegHi);
+    return;
+  }
+
+  assert(Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64 ||
+         Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
+         Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
+         isVectorType(Dest->getType()));
+
+  if (isScalarFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
+    _movp(Dest, ReturnReg);
+  } else {
+    _mov(Dest, ReturnReg);
+  }
+}
+
+void TargetDataX8664::lowerJumpTables() {
+  switch (Ctx->getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    for (const JumpTableData &JumpTable : Ctx->getJumpTables())
+      // TODO(jpp): not 386.
+      Writer->writeJumpTable(JumpTable, llvm::ELF::R_386_32);
+  } break;
+  case FT_Asm:
+    // Already emitted from Cfg
+    break;
+  case FT_Iasm: {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    for (const JumpTableData &JT : Ctx->getJumpTables()) {
+      Str << "\t.section\t.rodata." << JT.getFunctionName()
+          << "$jumptable,\"a\",@progbits\n";
+      Str << "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n";
+      Str << InstJumpTable::makeName(JT.getFunctionName(), JT.getId()) << ":";
+
+      // On X8664 ILP32, pointers are 32-bit, hence the use of .long.
+      for (intptr_t TargetOffset : JT.getTargetOffsets())
+        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
+      Str << "\n";
+    }
+  } break;
+  }
+}
+
 namespace {
 template <typename T> struct PoolTypeConverter {};
 
@@ -236,36 +524,6 @@
   Str << "\n";
 }
 
-void TargetDataX8664::lowerJumpTables() {
-  switch (Ctx->getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    for (const JumpTableData &JT : Ctx->getJumpTables())
-      // TODO(jpp): not 386.
-      Writer->writeJumpTable(JT, llvm::ELF::R_386_32);
-  } break;
-  case FT_Asm:
-    // Already emitted from Cfg
-    break;
-  case FT_Iasm: {
-    if (!BuildDefs::dump())
-      return;
-    Ostream &Str = Ctx->getStrEmit();
-    for (const JumpTableData &JT : Ctx->getJumpTables()) {
-      Str << "\t.section\t.rodata." << JT.getFunctionName()
-          << "$jumptable,\"a\",@progbits\n";
-      Str << "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n";
-      Str << InstJumpTable::makeName(JT.getFunctionName(), JT.getId()) << ":";
-
-      // On X8664 ILP32 pointers are 32-bit hence the use of .long
-      for (intptr_t TargetOffset : JT.getTargetOffsets())
-        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
-      Str << "\n";
-    }
-  } break;
-  }
-}
-
 void TargetDataX8664::lowerGlobals(const VariableDeclarationList &Vars,
                                    const IceString &SectionSuffix) {
   switch (Ctx->getFlags().getOutFileType()) {
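
Note (illustration, not part of the patch): for the FT_Iasm case of
TargetDataX8664::lowerJumpTables (moved above), the emitted text looks roughly
like what the sketch below prints. The function name, jump-table id, offsets,
and the label spelling are made up for illustration (the real label comes from
InstJumpTable::makeName); .long is used because pointers are 32-bit under the
x86-64 ILP32 model.

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      const std::string FuncName = "example_fn";       // hypothetical
      const std::vector<long> TargetOffsets = {16, 64}; // hypothetical
      std::cout << "\t.section\t.rodata." << FuncName
                << "$jumptable,\"a\",@progbits\n";
      std::cout << "\t.align\t4\n"; // typeWidthInBytes(pointer) == 4 under ILP32
      std::cout << FuncName << "$jumptable$0:"; // label spelling is illustrative
      for (long Offset : TargetOffsets)
        std::cout << "\n\t.long\t" << FuncName << "+" << Offset;
      std::cout << "\n";
      return 0;
    }
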
diff --git a/src/IceTargetLoweringX8664.h b/src/IceTargetLoweringX8664.h
index 8c4329d..1e012b5 100644
--- a/src/IceTargetLoweringX8664.h
+++ b/src/IceTargetLoweringX8664.h
@@ -38,6 +38,9 @@
 public:
   static TargetX8664 *create(Cfg *Func) { return new TargetX8664(Func); }
 
+protected:
+  void lowerCall(const InstCall *Instr) override;
+
 private:
   friend class ::Ice::X86Internal::TargetX86Base<TargetX8664>;
 
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index cc22171..89fc203 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -439,7 +439,9 @@
   }
 
   /// The maximum number of arguments to pass in XMM registers
-  static const uint32_t X86_MAX_XMM_ARGS = 4;
+  static const uint32_t X86_MAX_XMM_ARGS = 8;
+  /// The maximum number of arguments to pass in GPR registers
+  static const uint32_t X86_MAX_GPR_ARGS = 6;
   /// The number of bits in a byte
   static const uint32_t X86_CHAR_BIT = 8;
   /// Stack alignment. This is defined in IceTargetLoweringX8664.cpp because it
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index d89d747..1e33a96 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -83,6 +83,7 @@
   size_t typeWidthInBytesOnStack(Type Ty) const override {
     // Round up to the next multiple of 4 bytes.  In particular, i1,
     // i8, and i16 are rounded up to 4 bytes.
+    // TODO(jpp): this needs to round to multiples of 8 bytes in x86-64.
     return (typeWidthInBytes(Ty) + 3) & ~3;
   }
 
@@ -127,7 +128,6 @@
   void lowerArithmetic(const InstArithmetic *Inst) override;
   void lowerAssign(const InstAssign *Inst) override;
   void lowerBr(const InstBr *Inst) override;
-  void lowerCall(const InstCall *Inst) override;
   void lowerCast(const InstCast *Inst) override;
   void lowerExtractElement(const InstExtractElement *Inst) override;
   void lowerFcmp(const InstFcmp *Inst) override;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 3ec094e..245861c 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -2126,209 +2126,6 @@
 }
 
 template <class Machine>
-void TargetX86Base<Machine>::lowerCall(const InstCall *Instr) {
-  // x86-32 calling convention:
-  //
-  // * At the point before the call, the stack must be aligned to 16
-  // bytes.
-  //
-  // * The first four arguments of vector type, regardless of their
-  // position relative to the other arguments in the argument list, are
-  // placed in registers xmm0 - xmm3.
-  //
-  // * Other arguments are pushed onto the stack in right-to-left order,
-  // such that the left-most argument ends up on the top of the stack at
-  // the lowest memory address.
-  //
-  // * Stack arguments of vector type are aligned to start at the next
-  // highest multiple of 16 bytes.  Other stack arguments are aligned to
-  // 4 bytes.
-  //
-  // This intends to match the section "IA-32 Function Calling
-  // Convention" of the document "OS X ABI Function Call Guide" by
-  // Apple.
-  NeedsStackAlignment = true;
-
-  typedef std::vector<Operand *> OperandList;
-  OperandList XmmArgs;
-  OperandList StackArgs, StackArgLocations;
-  uint32_t ParameterAreaSizeBytes = 0;
-
-  // Classify each argument operand according to the location where the
-  // argument is passed.
-  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
-    Operand *Arg = Instr->getArg(i);
-    Type Ty = Arg->getType();
-    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
-    assert(typeWidthInBytes(Ty) >= 4);
-    if (isVectorType(Ty) && XmmArgs.size() < Traits::X86_MAX_XMM_ARGS) {
-      XmmArgs.push_back(Arg);
-    } else {
-      StackArgs.push_back(Arg);
-      if (isVectorType(Arg->getType())) {
-        ParameterAreaSizeBytes =
-            Traits::applyStackAlignment(ParameterAreaSizeBytes);
-      }
-      Variable *esp =
-          Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
-      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
-      StackArgLocations.push_back(
-          Traits::X86OperandMem::create(Func, Ty, esp, Loc));
-      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
-    }
-  }
-
-  // Adjust the parameter area so that the stack is aligned.  It is
-  // assumed that the stack is already aligned at the start of the
-  // calling sequence.
-  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
-
-  // Subtract the appropriate amount for the argument area.  This also
-  // takes care of setting the stack adjustment during emission.
-  //
-  // TODO: If for some reason the call instruction gets dead-code
-  // eliminated after lowering, we would need to ensure that the
-  // pre-call and the post-call esp adjustment get eliminated as well.
-  if (ParameterAreaSizeBytes) {
-    _adjust_stack(ParameterAreaSizeBytes);
-  }
-
-  // Copy arguments that are passed on the stack to the appropriate
-  // stack locations.
-  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
-    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
-  }
-
-  // Copy arguments to be passed in registers to the appropriate
-  // registers.
-  // TODO: Investigate the impact of lowering arguments passed in
-  // registers after lowering stack arguments as opposed to the other
-  // way around.  Lowering register arguments after stack arguments may
-  // reduce register pressure.  On the other hand, lowering register
-  // arguments first (before stack arguments) may result in more compact
-  // code, as the memory operand displacements may end up being smaller
-  // before any stack adjustment is done.
-  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
-    Variable *Reg =
-        legalizeToReg(XmmArgs[i], Traits::RegisterSet::Reg_xmm0 + i);
-    // Generate a FakeUse of register arguments so that they do not get
-    // dead code eliminated as a result of the FakeKill of scratch
-    // registers after the call.
-    Context.insert(InstFakeUse::create(Func, Reg));
-  }
-  // Generate the call instruction.  Assign its result to a temporary
-  // with high register allocation weight.
-  Variable *Dest = Instr->getDest();
-  // ReturnReg doubles as ReturnRegLo as necessary.
-  Variable *ReturnReg = nullptr;
-  Variable *ReturnRegHi = nullptr;
-  if (Dest) {
-    switch (Dest->getType()) {
-    case IceType_NUM:
-      llvm_unreachable("Invalid Call dest type");
-      break;
-    case IceType_void:
-      break;
-    case IceType_i1:
-    case IceType_i8:
-    case IceType_i16:
-    case IceType_i32:
-      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
-      break;
-    case IceType_i64:
-      ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
-      break;
-    case IceType_f32:
-    case IceType_f64:
-      // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
-      // the fstp instruction.
-      break;
-    case IceType_v4i1:
-    case IceType_v8i1:
-    case IceType_v16i1:
-    case IceType_v16i8:
-    case IceType_v8i16:
-    case IceType_v4i32:
-    case IceType_v4f32:
-      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_xmm0);
-      break;
-    }
-  }
-  Operand *CallTarget = legalize(Instr->getCallTarget());
-  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
-  if (NeedSandboxing) {
-    if (llvm::isa<Constant>(CallTarget)) {
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-    } else {
-      Variable *CallTargetVar = nullptr;
-      _mov(CallTargetVar, CallTarget);
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-      const SizeT BundleSize =
-          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
-      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
-      CallTarget = CallTargetVar;
-    }
-  }
-  Inst *NewCall = Traits::Insts::Call::create(Func, ReturnReg, CallTarget);
-  Context.insert(NewCall);
-  if (NeedSandboxing)
-    _bundle_unlock();
-  if (ReturnRegHi)
-    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
-
-  // Add the appropriate offset to esp.  The call instruction takes care
-  // of resetting the stack offset during emission.
-  if (ParameterAreaSizeBytes) {
-    Variable *esp =
-        Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
-    _add(esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
-  }
-
-  // Insert a register-kill pseudo instruction.
-  Context.insert(InstFakeKill::create(Func, NewCall));
-
-  // Generate a FakeUse to keep the call live if necessary.
-  if (Instr->hasSideEffects() && ReturnReg) {
-    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
-    Context.insert(FakeUse);
-  }
-
-  if (!Dest)
-    return;
-
-  // Assign the result of the call to Dest.
-  if (ReturnReg) {
-    if (ReturnRegHi) {
-      assert(Dest->getType() == IceType_i64);
-      split64(Dest);
-      Variable *DestLo = Dest->getLo();
-      Variable *DestHi = Dest->getHi();
-      _mov(DestLo, ReturnReg);
-      _mov(DestHi, ReturnRegHi);
-    } else {
-      assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
-             Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
-             isVectorType(Dest->getType()));
-      if (isVectorType(Dest->getType())) {
-        _movp(Dest, ReturnReg);
-      } else {
-        _mov(Dest, ReturnReg);
-      }
-    }
-  } else if (isScalarFloatingType(Dest->getType())) {
-    // Special treatment for an FP function which returns its result in
-    // st(0).
-    // If Dest ends up being a physical xmm register, the fstp emit code
-    // will route st(0) through a temporary stack slot.
-    _fstp(Dest);
-    // Create a fake use of Dest in case it actually isn't used,
-    // because st(0) still needs to be popped.
-    Context.insert(InstFakeUse::create(Func, Dest));
-  }
-}
-
-template <class Machine>
 void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
   InstCast::OpKind CastKind = Inst->getCastKind();