Subzero. Native 64-bit int arithmetic on x86-64.

This CL modifies the x86 instruction selection template to allow native
64-bit GPR support. It also enables x86-64 crosstests.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1273153002.
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 9d872d2..c34b776 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -243,9 +243,9 @@
 
   // Cross Xmm/GPR cast instructions.
   template <typename DReg_t, typename SReg_t> struct CastEmitterRegOp {
-    typedef void (AssemblerX86Base::*TypedEmitRegs)(Type, DReg_t, SReg_t);
+    typedef void (AssemblerX86Base::*TypedEmitRegs)(Type, DReg_t, Type, SReg_t);
     typedef void (AssemblerX86Base::*TypedEmitAddr)(
-        Type, DReg_t, const typename Traits::Address &);
+        Type, DReg_t, Type, const typename Traits::Address &);
 
     TypedEmitRegs RegReg;
     TypedEmitAddr RegAddr;
@@ -299,7 +299,14 @@
            typename Traits::GPRRegister src);
   void mov(Type Ty, const typename Traits::Address &dst, const Immediate &imm);
 
-  void movFromAh(const typename Traits::GPRRegister dst);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  movabs(const typename Traits::GPRRegister Dst, uint64_t Imm64);
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  movabs(const typename Traits::GPRRegister, uint64_t) {
+    llvm::report_fatal_error("movabs is only supported in 64-bit x86 targets.");
+  }
 
   void movzx(Type Ty, typename Traits::GPRRegister dst,
              typename Traits::GPRRegister src);
@@ -328,11 +335,13 @@
   void movss(Type Ty, typename Traits::XmmRegister dst,
              typename Traits::XmmRegister src);
 
-  void movd(typename Traits::XmmRegister dst, typename Traits::GPRRegister src);
-  void movd(typename Traits::XmmRegister dst,
+  void movd(Type SrcTy, typename Traits::XmmRegister dst,
+            typename Traits::GPRRegister src);
+  void movd(Type SrcTy, typename Traits::XmmRegister dst,
             const typename Traits::Address &src);
-  void movd(typename Traits::GPRRegister dst, typename Traits::XmmRegister src);
-  void movd(const typename Traits::Address &dst,
+  void movd(Type DestTy, typename Traits::GPRRegister dst,
+            typename Traits::XmmRegister src);
+  void movd(Type DestTy, const typename Traits::Address &dst,
             typename Traits::XmmRegister src);
 
   void movq(typename Traits::XmmRegister dst, typename Traits::XmmRegister src);
@@ -504,9 +513,9 @@
   void cvttps2dq(Type, typename Traits::XmmRegister dst,
                  const typename Traits::Address &src);
 
-  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst,
+  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst, Type SrcTy,
                 typename Traits::GPRRegister src);
-  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst,
+  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst, Type SrcTy,
                 const typename Traits::Address &src);
 
   void cvtfloat2float(Type SrcTy, typename Traits::XmmRegister dst,
@@ -514,9 +523,9 @@
   void cvtfloat2float(Type SrcTy, typename Traits::XmmRegister dst,
                       const typename Traits::Address &src);
 
-  void cvttss2si(Type SrcTy, typename Traits::GPRRegister dst,
+  void cvttss2si(Type DestTy, typename Traits::GPRRegister dst, Type SrcTy,
                  typename Traits::XmmRegister src);
-  void cvttss2si(Type SrcTy, typename Traits::GPRRegister dst,
+  void cvttss2si(Type DestTy, typename Traits::GPRRegister dst, Type SrcTy,
                  const typename Traits::Address &src);
 
   void ucomiss(Type Ty, typename Traits::XmmRegister a,
@@ -719,6 +728,12 @@
   void cbw();
   void cwd();
   void cdq();
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type cqo();
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type cqo() {
+    llvm::report_fatal_error("CQO is only available in 64-bit x86 backends.");
+  }
 
   void div(Type Ty, typename Traits::GPRRegister reg);
   void div(Type Ty, const typename Traits::Address &address);
@@ -936,7 +951,7 @@
                      typename Traits::GPRRegister>::value;
 
     return IsGPR && (Reg & 0x04) != 0 && (Reg & 0x08) == 0 &&
-           isByteSizedArithType(Ty);
+           isByteSizedType(Ty);
   };
 
   // assembleAndEmitRex is used for determining which (if any) rex prefix should
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index f785756..2cb039a 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -207,6 +207,8 @@
     emitUint8(0xB0 + gprEncoding(dst));
     emitUint8(imm.value() & 0xFF);
   } else {
+    // TODO(jpp): When removing the assertion above ensure that in x86-64 we
+    // emit a 64-bit immediate.
     emitUint8(0xB8 + gprEncoding(dst));
     emitImmediate(Ty, imm);
   }
@@ -279,9 +281,34 @@
 }
 
 template <class Machine>
+template <typename T>
+typename std::enable_if<T::Is64Bit, void>::type
+AssemblerX86Base<Machine>::movabs(const typename Traits::GPRRegister Dst,
+                                  uint64_t Imm64) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  const bool NeedsRexW = (Imm64 & ~0xFFFFFFFFull) != 0;
+  const Type RexType = NeedsRexW ? RexTypeForceRexW : RexTypeIrrelevant;
+  emitRexB(RexType, Dst);
+  emitUint8(0xB8 | gprEncoding(Dst));
+  // When emitting Imm64, we don't have to mask out the upper 32 bits because
+  // emitInt32 will/should only emit a 32-bit constant. In reality, we are
+  // paranoid, so we go ahead and mask the upper bits out anyway.
+  emitInt32(Imm64 & 0xFFFFFFFF);
+  if (NeedsRexW)
+    emitInt32((Imm64 >> 32) & 0xFFFFFFFF);
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::movzx(Type SrcTy,
                                       typename Traits::GPRRegister dst,
                                       typename Traits::GPRRegister src) {
+  if (Traits::Is64Bit && SrcTy == IceType_i32) {
+    // 32-bit mov clears the upper 32 bits, hence zero-extending the 32-bit
+    // operand to 64-bit.
+    mov(IceType_i32, dst, src);
+    return;
+  }
+
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
@@ -295,6 +322,13 @@
 void AssemblerX86Base<Machine>::movzx(Type SrcTy,
                                       typename Traits::GPRRegister dst,
                                       const typename Traits::Address &src) {
+  if (Traits::Is64Bit && SrcTy == IceType_i32) {
+    // 32-bit mov clears the upper 32 bits, hence zero-extending the 32-bit
+    // operand to 64-bit.
+    mov(IceType_i32, dst, src);
+    return;
+  }
+
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
@@ -359,7 +393,7 @@
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   else
-    assert(Ty == IceType_i32);
+    assert(Ty == IceType_i32 || (Traits::Is64Bit && Ty == IceType_i64));
   emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
@@ -375,7 +409,7 @@
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   else
-    assert(Ty == IceType_i32);
+    assert(Ty == IceType_i32 || (Traits::Is64Bit && Ty == IceType_i64));
   emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
@@ -423,44 +457,48 @@
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(typename Traits::XmmRegister dst,
+void AssemblerX86Base<Machine>::movd(Type SrcTy,
+                                     typename Traits::XmmRegister dst,
                                      typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRexRB(RexTypeIrrelevant, dst, src);
+  emitRexRB(SrcTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x6E);
   emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(typename Traits::XmmRegister dst,
+void AssemblerX86Base<Machine>::movd(Type SrcTy,
+                                     typename Traits::XmmRegister dst,
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRex(RexTypeIrrelevant, src, dst);
+  emitRex(SrcTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x6E);
   emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(typename Traits::GPRRegister dst,
+void AssemblerX86Base<Machine>::movd(Type DestTy,
+                                     typename Traits::GPRRegister dst,
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRexRB(RexTypeIrrelevant, src, dst);
+  emitRexRB(DestTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x7E);
   emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(const typename Traits::Address &dst,
+void AssemblerX86Base<Machine>::movd(Type DestTy,
+                                     const typename Traits::Address &dst,
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRex(RexTypeIrrelevant, dst, src);
+  emitRex(DestTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x7E);
   emitOperand(gprEncoding(src), dst);
@@ -1343,7 +1381,7 @@
   // Load 32-bit immediate value into tmp1.
   mov(IceType_i32, tmp1, imm);
   // Move value from tmp1 into dst.
-  movd(dst, tmp1);
+  movd(IceType_i32, dst, tmp1);
   // Broadcast low lane into other three lanes.
   shufps(RexTypeIrrelevant, dst, dst, Immediate(0x0));
 }
@@ -1487,10 +1525,11 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::cvtsi2ss(Type DestTy,
                                          typename Traits::XmmRegister dst,
+                                         Type SrcTy,
                                          typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
-  emitRexRB(RexTypeIrrelevant, dst, src);
+  emitRexRB(SrcTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2A);
   emitXmmRegisterOperand(dst, src);
@@ -1499,10 +1538,11 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::cvtsi2ss(Type DestTy,
                                          typename Traits::XmmRegister dst,
+                                         Type SrcTy,
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
-  emitRex(RexTypeIrrelevant, src, dst);
+  emitRex(SrcTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2A);
   emitOperand(gprEncoding(dst), src);
@@ -1534,24 +1574,26 @@
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::cvttss2si(Type SrcTy,
+void AssemblerX86Base<Machine>::cvttss2si(Type DestTy,
                                           typename Traits::GPRRegister dst,
+                                          Type SrcTy,
                                           typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
-  emitRexRB(RexTypeIrrelevant, dst, src);
+  emitRexRB(DestTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2C);
   emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::cvttss2si(Type SrcTy,
+void AssemblerX86Base<Machine>::cvttss2si(Type DestTy,
                                           typename Traits::GPRRegister dst,
+                                          Type SrcTy,
                                           const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
-  emitRex(RexTypeIrrelevant, src, dst);
+  emitRex(DestTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2C);
   emitOperand(gprEncoding(dst), src);
@@ -2401,6 +2443,15 @@
 }
 
 template <class Machine>
+template <typename T>
+typename std::enable_if<T::Is64Bit, void>::type
+AssemblerX86Base<Machine>::cqo() {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(RexTypeForceRexW, RexRegIrrelevant);
+  emitUint8(0x99);
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::div(Type Ty, typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
@@ -2459,7 +2510,8 @@
 void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister dst,
                                      typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  assert(Ty == IceType_i16 || Ty == IceType_i32 ||
+         (Traits::Is64Bit && Ty == IceType_i64));
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   emitRexRB(Ty, dst, src);
@@ -2472,7 +2524,8 @@
 void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister reg,
                                      const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  assert(Ty == IceType_i16 || Ty == IceType_i32 ||
+         (Traits::Is64Bit && Ty == IceType_i64));
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   emitRex(Ty, address, reg);
@@ -2790,8 +2843,7 @@
 void AssemblerX86Base<Machine>::bswap(Type Ty,
                                       typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  assert(Ty == IceType_i32);
-  (void)Ty;
+  assert(Ty == IceType_i32 || (Traits::Is64Bit && Ty == IceType_i64));
   emitRexB(Ty, reg);
   emitUint8(0x0F);
   emitUint8(0xC8 | gprEncoding(reg));
diff --git a/src/IceELFSection.h b/src/IceELFSection.h
index 5cf89a5..961d8d2 100644
--- a/src/IceELFSection.h
+++ b/src/IceELFSection.h
@@ -362,8 +362,7 @@
       llvm::report_fatal_error("Missing symbol mentioned in reloc");
 
     if (IsELF64) {
-      llvm_unreachable(
-          "Not tested -- check that Fixup.offset() is correct even for pc-rel");
+      // TODO(jpp): check that Fixup.offset() is correct even for pc-rel.
       Elf64_Rela Rela;
       Rela.r_offset = Fixup.position();
       Rela.setSymbolAndType(Symbol->getNumber(), Fixup.kind());
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index c6d6abf..3a56e1b 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -206,7 +206,7 @@
     } else if (const auto CR =
                    llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
       Disp = CR->getOffset();
-      Fixup = Asm->createFixup(llvm::ELF::R_386_32, CR);
+      Fixup = Asm->createFixup(RelFixup, CR);
     } else {
       llvm_unreachable("Unexpected offset type");
     }
diff --git a/src/IceInstX8664.cpp b/src/IceInstX8664.cpp
index 3709180..49dc9d8 100644
--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -179,8 +179,8 @@
       Disp = static_cast<int32_t>(CI->getValue());
     } else if (const auto CR =
                    llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
-      Disp = CR->getOffset();
-      Fixup = Asm->createFixup(llvm::ELF::R_386_32, CR);
+      Disp = CR->getOffset() - 4;
+      Fixup = Asm->createFixup(PcRelFixup, CR);
     } else {
       llvm_unreachable("Unexpected offset type");
     }
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 6d39005..b0eb1ad 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -1100,6 +1100,8 @@
     : public InstX86BaseUnaryopGPR<Machine, InstX86Base<Machine>::Movsx> {
 public:
   static InstX86Movsx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
     return new (Func->allocate<InstX86Movsx>()) InstX86Movsx(Func, Dest, Src);
   }
 
@@ -1116,6 +1118,8 @@
     : public InstX86BaseUnaryopGPR<Machine, InstX86Base<Machine>::Movzx> {
 public:
   static InstX86Movzx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
     return new (Func->allocate<InstX86Movzx>()) InstX86Movzx(Func, Dest, Src);
   }
 
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 34417cf..4d26210 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -729,7 +729,8 @@
   } else if (const auto Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
     (Asm->*(Emitter.GPRImm))(Ty, VarReg, Immediate(Imm->getValue()));
   } else if (const auto Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Reloc);
+    AssemblerFixup *Fixup =
+        Asm->createFixup(InstX86Base<Machine>::Traits::RelFixup, Reloc);
     (Asm->*(Emitter.GPRImm))(Ty, VarReg, Immediate(Reloc->getOffset(), Fixup));
   } else if (const auto Split = llvm::dyn_cast<
                  typename InstX86Base<Machine>::Traits::VariableSplit>(Src)) {
@@ -758,7 +759,8 @@
   } else if (const auto Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
     (Asm->*(Emitter.AddrImm))(Ty, Addr, Immediate(Imm->getValue()));
   } else if (const auto Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Reloc);
+    AssemblerFixup *Fixup =
+        Asm->createFixup(InstX86Base<Machine>::Traits::RelFixup, Reloc);
     (Asm->*(Emitter.AddrImm))(Ty, Addr, Immediate(Reloc->getOffset(), Fixup));
   } else {
     llvm_unreachable("Unexpected operand type");
@@ -929,8 +931,8 @@
 
 template <class Machine, typename DReg_t, typename SReg_t,
           DReg_t (*destEnc)(int32_t), SReg_t (*srcEnc)(int32_t)>
-void emitIASCastRegOp(const Cfg *Func, Type DispatchTy, const Variable *Dest,
-                      const Operand *Src,
+void emitIASCastRegOp(const Cfg *Func, Type DestTy, const Variable *Dest,
+                      Type SrcTy, const Operand *Src,
                       const typename InstX86Base<Machine>::Traits::Assembler::
                           template CastEmitterRegOp<DReg_t, SReg_t> &Emitter) {
   typename InstX86Base<Machine>::Traits::Assembler *Asm =
@@ -940,18 +942,18 @@
   if (const auto SrcVar = llvm::dyn_cast<Variable>(Src)) {
     if (SrcVar->hasReg()) {
       SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
-      (Asm->*(Emitter.RegReg))(DispatchTy, DestReg, SrcReg);
+      (Asm->*(Emitter.RegReg))(DestTy, DestReg, SrcTy, SrcReg);
     } else {
       typename InstX86Base<Machine>::Traits::Address SrcStackAddr =
           static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(
               Func->getTarget())
               ->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.RegAddr))(DispatchTy, DestReg, SrcStackAddr);
+      (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, SrcStackAddr);
     }
   } else if (const auto Mem = llvm::dyn_cast<
                  typename InstX86Base<Machine>::Traits::X86OperandMem>(Src)) {
     Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.RegAddr))(DispatchTy, DestReg, Mem->toAsmAddress(Asm));
+    (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, Mem->toAsmAddress(Asm));
   } else {
     llvm_unreachable("Unexpected operand type");
   }
@@ -1387,17 +1389,26 @@
   case IceType_i8:
     assert(this->getDest()->getRegNum() ==
            InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    Str << "\tcbtw";
+    Str << "\t"
+        << "cbtw";
     break;
   case IceType_i16:
     assert(this->getDest()->getRegNum() ==
            InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
-    Str << "\tcwtd";
+    Str << "\t"
+        << "cwtd";
     break;
   case IceType_i32:
     assert(this->getDest()->getRegNum() ==
            InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
-    Str << "\tcltd";
+    Str << "\t"
+        << "cltd";
+    break;
+  case IceType_i64:
+    assert(this->getDest()->getRegNum() ==
+           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    Str << "\t"
+        << "cdto";
     break;
   }
 }
@@ -1430,6 +1441,11 @@
            InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Asm->cdq();
     break;
+  case IceType_i64:
+    assert(this->getDest()->getRegNum() ==
+           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    Asm->cqo();
+    break;
   }
 }
 
@@ -1592,7 +1608,8 @@
   assert(this->getSrcSize() == 2);
   Operand *Src = this->getSrc(1);
   Type SrcTy = Src->getType();
-  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32);
+  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32 ||
+         (InstX86Base<Machine>::Traits::Is64Bit));
   typename InstX86Base<Machine>::Traits::Assembler *Asm =
       Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
   if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
@@ -1814,7 +1831,11 @@
   switch (Variant) {
   case Si2ss: {
     assert(isScalarIntegerType(SrcTy));
-    assert(typeWidthInBytes(SrcTy) <= 4);
+    if (!InstX86Base<Machine>::Traits::Is64Bit) {
+      assert(typeWidthInBytes(SrcTy) <= 4);
+    } else {
+      assert(SrcTy == IceType_i32 || SrcTy == IceType_i64);
+    }
     assert(isScalarFloatingType(DestTy));
     static const typename InstX86Base<Machine>::Traits::Assembler::
         template CastEmitterRegOp<
@@ -1828,13 +1849,17 @@
         typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR>(
-        Func, DestTy, Dest, Src, Emitter);
+        Func, DestTy, Dest, SrcTy, Src, Emitter);
     return;
   }
   case Tss2si: {
     assert(isScalarFloatingType(SrcTy));
     assert(isScalarIntegerType(DestTy));
-    assert(typeWidthInBytes(DestTy) <= 4);
+    if (!InstX86Base<Machine>::Traits::Is64Bit) {
+      assert(typeWidthInBytes(DestTy) <= 4);
+    } else {
+      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+    }
     static const typename InstX86Base<Machine>::Traits::Assembler::
         template CastEmitterRegOp<
             typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
@@ -1847,7 +1872,7 @@
         typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm>(
-        Func, SrcTy, Dest, Src, Emitter);
+        Func, DestTy, Dest, SrcTy, Src, Emitter);
     return;
   }
   case Float2float: {
@@ -2244,6 +2269,10 @@
   this->getDest()->emit(Func);
 }
 
+inline bool isIntegerConstant(const Operand *Op) {
+  return llvm::isa<ConstantInteger32>(Op) || llvm::isa<ConstantInteger64>(Op);
+}
+
 template <class Machine> void InstX86Mov<Machine>::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -2252,11 +2281,16 @@
   Operand *Src = this->getSrc(0);
   Type SrcTy = Src->getType();
   Type DestTy = this->getDest()->getType();
-  Str << "\tmov"
-      << (!isScalarFloatingType(DestTy)
-              ? this->getWidthString(SrcTy)
-              : InstX86Base<Machine>::Traits::TypeAttributes[DestTy].SdSsString)
-      << "\t";
+  if (InstX86Base<Machine>::Traits::Is64Bit && DestTy == IceType_i64 &&
+      isIntegerConstant(Src)) {
+    Str << "\tmovabs\t";
+  } else {
+    Str << "\tmov"
+        << (!isScalarFloatingType(DestTy)
+                ? this->getWidthString(SrcTy)
+                : InstX86Base<Machine>::Traits::TypeAttributes[DestTy]
+                      .SdSsString) << "\t";
+  }
   // For an integer truncation operation, src is wider than dest.
   // Ideally, we use a mov instruction whose data width matches the
   // narrower dest.  This is a problem if e.g. src is a register like
@@ -2320,6 +2354,20 @@
       assert(isScalarIntegerType(DestTy));
       // Widen DestTy for truncation (see above note). We should only do this
       // when both Src and Dest are integer types.
+      if (InstX86Base<Machine>::Traits::Is64Bit && DestTy == IceType_i64 &&
+          isIntegerConstant(Src)) {
+        uint64_t Value = -1;
+        if (const auto *C64 = llvm::dyn_cast<ConstantInteger64>(Src)) {
+          Value = C64->getValue();
+        } else {
+          Value = llvm::cast<ConstantInteger32>(Src)->getValue();
+        }
+        Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>()
+            ->movabs(InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+                         Dest->getRegNum()),
+                     Value);
+        return;
+      }
       if (isScalarIntegerType(SrcTy)) {
         DestTy = SrcTy;
       }
@@ -2363,14 +2411,19 @@
   const auto SrcVar = llvm::cast<Variable>(this->getSrc(0));
   // For insert/extract element (one of Src/Dest is an Xmm vector and
   // the other is an int type).
-  if (SrcVar->getType() == IceType_i32) {
-    assert(isVectorType(Dest->getType()));
+  if (SrcVar->getType() == IceType_i32 ||
+      (InstX86Base<Machine>::Traits::Is64Bit &&
+       SrcVar->getType() == IceType_i64)) {
+    assert(isVectorType(Dest->getType()) ||
+           (isScalarFloatingType(Dest->getType()) &&
+            typeWidthInBytes(SrcVar->getType()) ==
+                typeWidthInBytes(Dest->getType())));
     assert(Dest->hasReg());
     typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister DestReg =
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm(
             Dest->getRegNum());
     if (SrcVar->hasReg()) {
-      Asm->movd(DestReg,
+      Asm->movd(SrcVar->getType(), DestReg,
                 InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
                     SrcVar->getRegNum()));
     } else {
@@ -2378,17 +2431,23 @@
           static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(
               Func->getTarget())
               ->stackVarToAsmOperand(SrcVar));
-      Asm->movd(DestReg, StackAddr);
+      Asm->movd(SrcVar->getType(), DestReg, StackAddr);
     }
   } else {
-    assert(isVectorType(SrcVar->getType()));
+    assert(isVectorType(SrcVar->getType()) ||
+           (isScalarFloatingType(SrcVar->getType()) &&
+            typeWidthInBytes(SrcVar->getType()) ==
+                typeWidthInBytes(Dest->getType())));
     assert(SrcVar->hasReg());
-    assert(Dest->getType() == IceType_i32);
+    assert(Dest->getType() == IceType_i32 ||
+           (InstX86Base<Machine>::Traits::Is64Bit &&
+            Dest->getType() == IceType_i64));
     typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister SrcReg =
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm(
             SrcVar->getRegNum());
     if (Dest->hasReg()) {
-      Asm->movd(InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+      Asm->movd(Dest->getType(),
+                InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
                     Dest->getRegNum()),
                 SrcReg);
     } else {
@@ -2396,7 +2455,7 @@
           static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(
               Func->getTarget())
               ->stackVarToAsmOperand(Dest));
-      Asm->movd(StackAddr, SrcReg);
+      Asm->movd(Dest->getType(), StackAddr, SrcReg);
     }
   }
 }
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 6724a61..466564d 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -792,7 +792,7 @@
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
     for (const JumpTableData &JT : Ctx->getJumpTables())
-      Writer->writeJumpTable(JT, llvm::ELF::R_386_32);
+      Writer->writeJumpTable(JT, TargetX8632::Traits::RelFixup);
   } break;
   case FT_Asm:
     // Already emitted from Cfg
@@ -821,7 +821,8 @@
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
+    Writer->writeDataSection(Vars, TargetX8632::Traits::RelFixup,
+                             SectionSuffix);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index 29066aa..e0acbd6 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -68,6 +68,7 @@
   static const GPRRegister Encoded_Reg_Accumulator = RegX8632::Encoded_Reg_eax;
   static const GPRRegister Encoded_Reg_Counter = RegX8632::Encoded_Reg_ecx;
   static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32;
+  static const FixupKind RelFixup = llvm::ELF::R_386_32;
 
   class Operand {
   public:
@@ -272,6 +273,7 @@
   };
 
   static const char *TargetName;
+  static constexpr Type WordType = IceType_i32;
 
   static IceString getRegName(SizeT RegNum, Type Ty) {
     assert(RegNum < RegisterSet::Reg_NUM);
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 9056648..41d24cc 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -123,7 +123,7 @@
 }
 
 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
-// OperandList in lowerCall. std::max() was supposed to work, but it doesn't.
+// OperandList in lowerCall. std::max() is supposed to work, but it doesn't.
 constexpr SizeT constexprMax(SizeT S0, SizeT S1) { return S0 < S1 ? S1 : S0; }
 
 } // end of anonymous namespace
@@ -239,7 +239,6 @@
   Variable *Dest = Instr->getDest();
   // ReturnReg doubles as ReturnRegLo as necessary.
   Variable *ReturnReg = nullptr;
-  Variable *ReturnRegHi = nullptr;
   if (Dest) {
     switch (Dest->getType()) {
     case IceType_NUM:
@@ -250,12 +249,8 @@
     case IceType_i8:
     case IceType_i16:
     case IceType_i32:
-      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
-      break;
     case IceType_i64:
-      // TODO(jpp): return i64 in a GPR.
-      ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
       break;
     case IceType_f32:
     case IceType_f64:
@@ -271,27 +266,16 @@
     }
   }
 
-  Operand *CallTarget = legalize(Instr->getCallTarget());
+  Operand *CallTarget = legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm);
   const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
   if (NeedSandboxing) {
-    if (llvm::isa<Constant>(CallTarget)) {
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-    } else {
-      Variable *CallTargetVar = nullptr;
-      _mov(CallTargetVar, CallTarget);
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-      const SizeT BundleSize =
-          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
-      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
-      CallTarget = CallTargetVar;
-    }
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
   }
   Inst *NewCall = Traits::Insts::Call::create(Func, ReturnReg, CallTarget);
   Context.insert(NewCall);
-  if (NeedSandboxing)
-    _bundle_unlock();
-  if (ReturnRegHi)
-    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
+  if (NeedSandboxing) {
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
+  }
 
   // Add the appropriate offset to esp.  The call instruction takes care
   // of resetting the stack offset during emission.
@@ -315,25 +299,11 @@
 
   assert(ReturnReg && "x86-64 always returns value on registers.");
 
-  // Assign the result of the call to Dest.
-  if (ReturnRegHi) {
-    assert(Dest->getType() == IceType_i64);
-    split64(Dest);
-    Variable *DestLo = Dest->getLo();
-    Variable *DestHi = Dest->getHi();
-    _mov(DestLo, ReturnReg);
-    _mov(DestHi, ReturnRegHi);
-    return;
-  }
-
-  assert(Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64 ||
-         Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
-         Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
-         isVectorType(Dest->getType()));
-
-  if (isScalarFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
+  if (isVectorType(Dest->getType())) {
     _movp(Dest, ReturnReg);
   } else {
+    assert(isScalarFloatingType(Dest->getType()) ||
+           isScalarIntegerType(Dest->getType()));
     _mov(Dest, ReturnReg);
   }
 }
@@ -356,36 +326,36 @@
        ++i) {
     Variable *Arg = Args[i];
     Type Ty = Arg->getType();
-    if ((isVectorType(Ty) || isScalarFloatingType(Ty)) &&
-        NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
-      // Replace Arg in the argument list with the home register.  Then
-      // generate an instruction in the prolog to copy the home register
-      // to the assigned location of Arg.
-      int32_t RegNum = getRegisterForXmmArgNum(NumXmmArgs);
+    Variable *RegisterArg = nullptr;
+    int32_t RegNum = Variable::NoRegister;
+    if ((isVectorType(Ty) || isScalarFloatingType(Ty))) {
+      if (NumXmmArgs >= Traits::X86_MAX_XMM_ARGS) {
+        continue;
+      }
+      RegNum = getRegisterForXmmArgNum(NumXmmArgs);
       ++NumXmmArgs;
-      Variable *RegisterArg = Func->makeVariable(Ty);
-      if (BuildDefs::dump())
-        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
-      RegisterArg->setRegNum(RegNum);
-      RegisterArg->setIsArg();
-      Arg->setIsArg(false);
-
-      Args[i] = RegisterArg;
-      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
-    } else if (isScalarIntegerType(Ty) &&
-               NumGprArgs < Traits::X86_MAX_GPR_ARGS) {
-      int32_t RegNum = getRegisterForGprArgNum(NumGprArgs);
+      RegisterArg = Func->makeVariable(Ty);
+    } else if (isScalarIntegerType(Ty)) {
+      if (NumGprArgs >= Traits::X86_MAX_GPR_ARGS) {
+        continue;
+      }
+      RegNum = getRegisterForGprArgNum(NumGprArgs);
       ++NumGprArgs;
-      Variable *RegisterArg = Func->makeVariable(Ty);
-      if (BuildDefs::dump())
-        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
-      RegisterArg->setRegNum(RegNum);
-      RegisterArg->setIsArg();
-      Arg->setIsArg(false);
-
-      Args[i] = RegisterArg;
-      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+      RegisterArg = Func->makeVariable(Ty);
     }
+    assert(RegNum != Variable::NoRegister);
+    assert(RegisterArg != nullptr);
+    // Replace Arg in the argument list with the home register.  Then
+    // generate an instruction in the prolog to copy the home register
+    // to the assigned location of Arg.
+    if (BuildDefs::dump())
+      RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
+    RegisterArg->setRegNum(RegNum);
+    RegisterArg->setIsArg();
+    Arg->setIsArg(false);
+
+    Args[i] = RegisterArg;
+    Context.insert(InstAssign::create(Func, Arg, RegisterArg));
   }
 }
 
@@ -393,19 +363,11 @@
   Variable *Reg = nullptr;
   if (Inst->hasRetValue()) {
     Operand *Src0 = legalize(Inst->getRetValue());
-    // TODO(jpp): this is not needed.
-    if (Src0->getType() == IceType_i64) {
-      Variable *eax =
-          legalizeToReg(loOperand(Src0), Traits::RegisterSet::Reg_eax);
-      Variable *edx =
-          legalizeToReg(hiOperand(Src0), Traits::RegisterSet::Reg_edx);
-      Reg = eax;
-      Context.insert(InstFakeUse::create(Func, edx));
-    } else if (isScalarFloatingType(Src0->getType())) {
-      _fld(Src0);
-    } else if (isVectorType(Src0->getType())) {
+    if (isVectorType(Src0->getType()) ||
+        isScalarFloatingType(Src0->getType())) {
       Reg = legalizeToReg(Src0, Traits::RegisterSet::Reg_xmm0);
     } else {
+      assert(isScalarIntegerType(Src0->getType()));
       _mov(Reg, Src0, Traits::RegisterSet::Reg_eax);
     }
   }
@@ -577,19 +539,17 @@
   unsigned NumGPRArgs = 0;
   for (Variable *Arg : Args) {
     // Skip arguments passed in registers.
-    if (isVectorType(Arg->getType()) && NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
-      ++NumXmmArgs;
-      continue;
-    }
-    if (isScalarFloatingType(Arg->getType()) &&
-        NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
-      ++NumXmmArgs;
-      continue;
-    }
-    if (isScalarIntegerType(Arg->getType()) &&
-        NumGPRArgs < Traits::X86_MAX_GPR_ARGS) {
-      ++NumGPRArgs;
-      continue;
+    if (isVectorType(Arg->getType()) || isScalarFloatingType(Arg->getType())) {
+      if (NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
+        ++NumXmmArgs;
+        continue;
+      }
+    } else {
+      assert(isScalarIntegerType(Arg->getType()));
+      if (NumGPRArgs < Traits::X86_MAX_GPR_ARGS) {
+        ++NumGPRArgs;
+        continue;
+      }
     }
     finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
   }
@@ -679,23 +639,9 @@
     }
   }
 
-  if (!Ctx->getFlags().getUseSandboxing())
-    return;
-  // Change the original ret instruction into a sandboxed return sequence.
-  // t:ecx = pop
-  // bundle_lock
-  // and t, ~31
-  // jmp *t
-  // bundle_unlock
-  // FakeUse <original_ret_operand>
-  Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
-  _pop(T_ecx);
-  lowerIndirectJump(T_ecx);
-  if (RI->getSrcSize()) {
-    Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
-    Context.insert(InstFakeUse::create(Func, RetValue));
+  if (Ctx->getFlags().getUseSandboxing()) {
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
   }
-  RI->setDeleted();
 }
 
 void TargetX8664::emitJumpTable(const Cfg *Func,
@@ -858,8 +804,7 @@
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
     for (const JumpTableData &JumpTable : Ctx->getJumpTables())
-      // TODO(jpp): not 386.
-      Writer->writeJumpTable(JumpTable, llvm::ELF::R_386_32);
+      Writer->writeJumpTable(JumpTable, TargetX8664::Traits::RelFixup);
   } break;
   case FT_Asm:
     // Already emitted from Cfg
@@ -888,8 +833,8 @@
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    // TODO(jpp): not 386.
-    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
+    Writer->writeDataSection(Vars, TargetX8664::Traits::RelFixup,
+                             SectionSuffix);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 89fc203..4a12004 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -66,7 +66,8 @@
   using RegisterSet = ::Ice::RegX8664;
   static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax;
   static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx;
-  static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32; // TODO(jpp): ???
+  static const FixupKind PcRelFixup = llvm::ELF::R_X86_64_PC32;
+  static const FixupKind RelFixup = llvm::ELF::R_X86_64_32S;
 
   class Operand {
   public:
@@ -270,8 +271,8 @@
 
     static Address ofConstPool(Assembler *Asm, const Constant *Imm) {
       // TODO(jpp): ???
-      AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Imm);
-      const RelocOffsetT Offset = 0;
+      AssemblerFixup *Fixup = Asm->createFixup(RelFixup, Imm);
+      const RelocOffsetT Offset = 4;
       return Address(ABSOLUTE, Offset, Fixup);
     }
   };
@@ -293,6 +294,7 @@
   };
 
   static const char *TargetName;
+  static constexpr Type WordType = IceType_i64;
 
   static IceString getRegName(SizeT RegNum, Type Ty) {
     assert(RegNum < RegisterSet::Reg_NUM);
@@ -331,7 +333,7 @@
 #define X(val, encode, name64, name32, name16, name8, scratch, preserved,      \
           stackptr, frameptr, isInt, isFP)                                     \
   (*IntegerRegisters)[RegisterSet::val] = isInt;                               \
-  (*IntegerRegistersI8)[RegisterSet::val] = 1;                                 \
+  (*IntegerRegistersI8)[RegisterSet::val] = isInt;                             \
   (*FloatRegisters)[RegisterSet::val] = isFP;                                  \
   (*VectorRegisters)[RegisterSet::val] = isFP;                                 \
   (*ScratchRegs)[RegisterSet::val] = scratch;
@@ -450,7 +452,7 @@
   /// address.
   static const uint32_t X86_STACK_ALIGNMENT_BYTES;
   /// Size of the return address on the stack
-  static const uint32_t X86_RET_IP_SIZE_BYTES = 4;
+  static const uint32_t X86_RET_IP_SIZE_BYTES = 8;
   /// The number of different NOP instructions
   static const uint32_t X86_NUM_NOP_VARIANTS = 5;
 
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 342c97b..da863f4 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -21,6 +21,7 @@
 #include "IceInst.h"
 #include "IceSwitchLowering.h"
 #include "IceTargetLowering.h"
+#include "IceUtils.h"
 
 #include <type_traits>
 #include <utility>
@@ -80,10 +81,9 @@
                            : Traits::RegisterSet::Reg_esp;
   }
   size_t typeWidthInBytesOnStack(Type Ty) const override {
-    // Round up to the next multiple of 4 bytes.  In particular, i1,
-    // i8, and i16 are rounded up to 4 bytes.
-    // TODO(jpp): this needs to round to multiples of 8 bytes in x86-64.
-    return (typeWidthInBytes(Ty) + 3) & ~3;
+    // Round up to the next multiple of WordType bytes.
+    const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
+    return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
   }
 
   SizeT getMinJumpTableSize() const override { return 4; }
@@ -98,14 +98,40 @@
   void emit(const ConstantDouble *C) const final;
 
   void initNodeForLowering(CfgNode *Node) override;
-  /// Ensure that a 64-bit Variable has been split into 2 32-bit
+  /// x86-32: Ensure that a 64-bit Variable has been split into 2 32-bit
   /// Variables, creating them if necessary.  This is needed for all
   /// I64 operations, and it is needed for pushing F64 arguments for
   /// function calls using the 32-bit push instruction (though the
   /// latter could be done by directly writing to the stack).
-  void split64(Variable *Var);
-  Operand *loOperand(Operand *Operand);
-  Operand *hiOperand(Operand *Operand);
+  ///
+  /// x86-64: Complains loudly if invoked because the cpu can handle
+  /// 64-bit types natively.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type split64(Variable *Var);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type split64(Variable *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (split64)");
+  }
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  loOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *loOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (loOperand)");
+  }
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  hiOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *hiOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (hiOperand)");
+  }
+
   void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                               size_t BasicFrameOffset, size_t &InArgsSizeBytes);
   typename Traits::Address stackVarToAsmOperand(const Variable *Var) const;
@@ -128,6 +154,19 @@
   void lowerExtractElement(const InstExtractElement *Inst) override;
   void lowerFcmp(const InstFcmp *Inst) override;
   void lowerIcmp(const InstIcmp *Inst) override;
+  /// Complains loudly if invoked because the cpu can handle 64-bit types
+  /// natively.
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (lowerIcmp64)");
+  }
+  /// lowerIcmp64 handles 64-bit icmp lowering.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *Inst);
+
   void lowerIntrinsicCall(const InstIntrinsicCall *Inst) override;
   void lowerInsertElement(const InstInsertElement *Inst) override;
   void lowerLoad(const InstLoad *Inst) override;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 8dad58e..e190b5d 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -77,6 +77,7 @@
 public:
   enum BoolFoldingProducerKind {
     PK_None,
+    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
     PK_Icmp32,
     PK_Icmp64,
     PK_Fcmp,
@@ -120,7 +121,7 @@
 typename BoolFolding<MachineTraits>::BoolFoldingProducerKind
 BoolFolding<MachineTraits>::getProducerKind(const Inst *Instr) {
   if (llvm::isa<InstIcmp>(Instr)) {
-    if (Instr->getSrc(0)->getType() != IceType_i64)
+    if (MachineTraits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
       return PK_Icmp32;
     return PK_None; // TODO(stichnot): actually PK_Icmp64;
   }
@@ -643,10 +644,10 @@
       } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
         // An AtomicLoad intrinsic qualifies as long as it has a valid
         // memory ordering, and can be implemented in a single
-        // instruction (i.e., not i64).
+        // instruction (i.e., not i64 on x86-32).
         Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
         if (ID == Intrinsics::AtomicLoad &&
-            Intrin->getDest()->getType() != IceType_i64 &&
+            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
             Intrinsics::isMemoryOrderValid(
                 ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
           LoadDest = Intrin->getDest();
@@ -724,6 +725,10 @@
 
 template <class Machine>
 Variable *TargetX86Base<Machine>::getPhysicalRegister(SizeT RegNum, Type Ty) {
+  // Special case: never allow partial reads/writes to/from %rBP and %rSP.
+  if (RegNum == Traits::RegisterSet::Reg_esp ||
+      RegNum == Traits::RegisterSet::Reg_ebp)
+    Ty = Traits::WordType;
   if (Ty == IceType_void)
     Ty = IceType_i32;
   if (PhysicalRegisters[Ty].empty())
@@ -770,7 +775,7 @@
   }
   if (Offset)
     Str << Offset;
-  const Type FrameSPTy = IceType_i32;
+  const Type FrameSPTy = Traits::WordType;
   Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
 }
 
@@ -810,8 +815,7 @@
   Variable *Lo = Arg->getLo();
   Variable *Hi = Arg->getHi();
   Type Ty = Arg->getType();
-  if (Lo && Hi && Ty == IceType_i64) {
-    // TODO(jpp): This special case is not needed for x86-64.
+  if (!Traits::Is64Bit && Lo && Hi && Ty == IceType_i64) {
     assert(Lo->getType() != IceType_i64); // don't want infinite recursion
     assert(Hi->getType() != IceType_i64); // don't want infinite recursion
     finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
@@ -824,7 +828,7 @@
   Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
   InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
   if (Arg->hasReg()) {
-    assert(Ty != IceType_i64);
+    assert(Ty != IceType_i64 || Traits::Is64Bit);
     typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create(
         Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset()));
     if (isVectorType(Arg->getType())) {
@@ -840,11 +844,13 @@
 }
 
 template <class Machine> Type TargetX86Base<Machine>::stackSlotType() {
-  // TODO(jpp): this is wrong for x86-64.
-  return IceType_i32;
+  return Traits::WordType;
 }
 
-template <class Machine> void TargetX86Base<Machine>::split64(Variable *Var) {
+template <class Machine>
+template <typename T>
+typename std::enable_if<!T::Is64Bit, void>::type
+TargetX86Base<Machine>::split64(Variable *Var) {
   switch (Var->getType()) {
   default:
     return;
@@ -876,7 +882,9 @@
 }
 
 template <class Machine>
-Operand *TargetX86Base<Machine>::loOperand(Operand *Operand) {
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX86Base<Machine>::loOperand(Operand *Operand) {
   assert(Operand->getType() == IceType_i64 ||
          Operand->getType() == IceType_f64);
   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
@@ -905,7 +913,9 @@
 }
 
 template <class Machine>
-Operand *TargetX86Base<Machine>::hiOperand(Operand *Operand) {
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX86Base<Machine>::hiOperand(Operand *Operand) {
   assert(Operand->getType() == IceType_i64 ||
          Operand->getType() == IceType_f64);
   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
@@ -1107,8 +1117,8 @@
     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
       std::swap(Src0, Src1);
   }
-  if (Dest->getType() == IceType_i64) {
-    // These helper-call-involved instructions are lowered in this
+  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    // These x86-32 helper-call-involved instructions are lowered in this
     // separate switch. This is because loOperand() and hiOperand()
     // may insert redundant instructions for constant blinding and
     // pooling. Such redundant instructions will fail liveness analysis
@@ -1656,7 +1666,8 @@
       Context.insert(InstFakeUse::create(Func, T_eax));
     } else {
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(T_edx, Zero, Traits::RegisterSet::Reg_edx);
+      T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx);
+      _mov(T_edx, Zero);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
       _div(T_edx, Src1, T);
       _mov(Dest, T_edx);
@@ -1721,7 +1732,7 @@
       _mov(Dest, T);
       Context.insert(InstFakeUse::create(Func, T_eax));
     } else {
-      T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
       _cbwdq(T_edx, T);
       _idiv(T_edx, Src1, T);
@@ -1765,7 +1776,7 @@
   Variable *Dest = Inst->getDest();
   Operand *Src0 = Inst->getSrc(0);
   assert(Dest->getType() == Src0->getType());
-  if (Dest->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
     Src0 = legalize(Src0);
     Operand *Src0Lo = loOperand(Src0);
     Operand *Src0Hi = hiOperand(Src0);
@@ -1870,7 +1881,7 @@
         _psra(T, ShiftConstant);
         _movp(Dest, T);
       }
-    } else if (Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
       Constant *Shift = Ctx->getConstantInt32(31);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -1930,7 +1941,7 @@
       _movp(T, Src0RM);
       _pand(T, OneMask);
       _movp(Dest, T);
-    } else if (Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // t1=movzx src; dst.lo=t1; dst.hi=0
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -1951,13 +1962,16 @@
       // t = Src0RM; t &= 1; Dest = t
       Constant *One = Ctx->getConstantInt32(1);
       Type DestTy = Dest->getType();
-      Variable *T;
+      Variable *T = nullptr;
       if (DestTy == IceType_i8) {
-        T = makeReg(DestTy);
         _mov(T, Src0RM);
       } else {
+        assert(DestTy != IceType_i1);
+        assert(Traits::Is64Bit || DestTy != IceType_i64);
         // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
-        T = makeReg(IceType_i32);
+        // In x86-64 we need to widen T to 64-bits to ensure that T -- if
+        // written to the stack (i.e., in -Om1) -- will be fully zero-extended.
+        T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
         _movzx(T, Src0RM);
       }
       _and(T, One);
@@ -1982,7 +1996,7 @@
       _movp(Dest, T);
     } else {
       Operand *Src0 = legalizeUndef(Inst->getSrc(0));
-      if (Src0->getType() == IceType_i64)
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
         Src0 = loOperand(Src0);
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       // t1 = trunc Src0RM; Dest = t1
@@ -2013,7 +2027,7 @@
       Variable *T = makeReg(Dest->getType());
       _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq);
       _movp(Dest, T);
-    } else if (Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // Use a helper for converting floating-point values to 64-bit
       // integers.  SSE2 appears to have no way to convert from xmm
       // registers to something like the edx:eax register pair, and
@@ -2032,7 +2046,15 @@
     } else {
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Dest->getType() == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Dest->getType() != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      // cvt() requires its integer argument to be a GPR.
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
@@ -2050,14 +2072,18 @@
       Call->addArg(Inst->getSrc(0));
       lowerCall(Call);
     } else if (Dest->getType() == IceType_i64 ||
-               Dest->getType() == IceType_i32) {
+               (!Traits::Is64Bit && Dest->getType() == IceType_i32)) {
       // Use a helper for both x86-32 and x86-64.
-      split64(Dest);
+      if (!Traits::Is64Bit)
+        split64(Dest);
       const SizeT MaxSrcs = 1;
       Type DestType = Dest->getType();
       Type SrcType = Inst->getSrc(0)->getType();
       IceString TargetString;
-      if (isInt32Asserting32Or64(DestType)) {
+      if (Traits::Is64Bit) {
+        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
+                                                         : H_fptoui_f64_i64;
+      } else if (isInt32Asserting32Or64(DestType)) {
         TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
                                                          : H_fptoui_f64_i32;
       } else {
@@ -2071,7 +2097,15 @@
     } else {
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = makeReg(IceType_i32);
+      assert(Dest->getType() != IceType_i64);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Dest->getType() == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Dest->getType() != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
@@ -2090,7 +2124,7 @@
       Variable *T = makeReg(Dest->getType());
       _cvt(T, Src0RM, Traits::Insts::Cvt::Dq2ps);
       _movp(Dest, T);
-    } else if (Inst->getSrc(0)->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Inst->getSrc(0)->getType() == IceType_i64) {
       // Use a helper for x86-32.
       const SizeT MaxSrcs = 1;
       Type DestType = Dest->getType();
@@ -2106,9 +2140,16 @@
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // Sign-extend the operand.
       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
-      if (Src0RM->getType() == IceType_i32)
+      if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
         _movsx(T_1, Src0RM);
@@ -2126,7 +2167,7 @@
       Call->addArg(Src0);
       lowerCall(Call);
     } else if (Src0->getType() == IceType_i64 ||
-               Src0->getType() == IceType_i32) {
+               (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
       // Use a helper for x86-32 and x86-64.  Also use a helper for
       // i32 on x86-32.
       const SizeT MaxSrcs = 1;
@@ -2147,9 +2188,17 @@
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       // Zero-extend the operand.
       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
-      if (Src0RM->getType() == IceType_i32)
+      if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
         _movzx(T_1, Src0RM);
@@ -2205,77 +2254,96 @@
       _mov(Dest, Spill);
     } break;
     case IceType_i64: {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      assert(Src0RM->getType() == IceType_f64);
-      // a.i64 = bitcast b.f64 ==>
-      //   s.f64 = spill b.f64
-      //   t_lo.i32 = lo(s.f64)
-      //   a_lo.i32 = t_lo.i32
-      //   t_hi.i32 = hi(s.f64)
-      //   a_hi.i32 = t_hi.i32
-      Operand *SpillLo, *SpillHi;
-      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
-        typename Traits::SpillVariable *SpillVar =
-            Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
-        SpillVar->setLinkedTo(Src0Var);
-        Variable *Spill = SpillVar;
-        Spill->setWeight(RegWeight::Zero);
-        _movq(Spill, Src0RM);
-        SpillLo = Traits::VariableSplit::create(Func, Spill,
-                                                Traits::VariableSplit::Low);
-        SpillHi = Traits::VariableSplit::create(Func, Spill,
-                                                Traits::VariableSplit::High);
+      assert(Src0->getType() == IceType_f64);
+      if (Traits::Is64Bit) {
+        // Movd requires its fp argument (in this case, the bitcast source) to
+        // be an xmm register.
+        Variable *Src0R = legalizeToReg(Src0);
+        Variable *T = makeReg(IceType_i64);
+        _movd(T, Src0R);
+        _mov(Dest, T);
       } else {
-        SpillLo = loOperand(Src0RM);
-        SpillHi = hiOperand(Src0RM);
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        // a.i64 = bitcast b.f64 ==>
+        //   s.f64 = spill b.f64
+        //   t_lo.i32 = lo(s.f64)
+        //   a_lo.i32 = t_lo.i32
+        //   t_hi.i32 = hi(s.f64)
+        //   a_hi.i32 = t_hi.i32
+        Operand *SpillLo, *SpillHi;
+        if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+          typename Traits::SpillVariable *SpillVar =
+              Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
+          SpillVar->setLinkedTo(Src0Var);
+          Variable *Spill = SpillVar;
+          Spill->setWeight(RegWeight::Zero);
+          _movq(Spill, Src0RM);
+          SpillLo = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::Low);
+          SpillHi = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::High);
+        } else {
+          SpillLo = loOperand(Src0RM);
+          SpillHi = hiOperand(Src0RM);
+        }
+
+        Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+        Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Variable *T_Lo = makeReg(IceType_i32);
+        Variable *T_Hi = makeReg(IceType_i32);
+
+        _mov(T_Lo, SpillLo);
+        _mov(DestLo, T_Lo);
+        _mov(T_Hi, SpillHi);
+        _mov(DestHi, T_Hi);
       }
-
-      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *T_Lo = makeReg(IceType_i32);
-      Variable *T_Hi = makeReg(IceType_i32);
-
-      _mov(T_Lo, SpillLo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, SpillHi);
-      _mov(DestHi, T_Hi);
     } break;
     case IceType_f64: {
-      Src0 = legalize(Src0);
       assert(Src0->getType() == IceType_i64);
-      if (llvm::isa<typename Traits::X86OperandMem>(Src0)) {
-        Variable *T = Func->makeVariable(Dest->getType());
-        _movq(T, Src0);
-        _movq(Dest, T);
-        break;
-      }
-      // a.f64 = bitcast b.i64 ==>
-      //   t_lo.i32 = b_lo.i32
-      //   FakeDef(s.f64)
-      //   lo(s.f64) = t_lo.i32
-      //   t_hi.i32 = b_hi.i32
-      //   hi(s.f64) = t_hi.i32
-      //   a.f64 = s.f64
-      typename Traits::SpillVariable *SpillVar =
-          Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
-      SpillVar->setLinkedTo(Dest);
-      Variable *Spill = SpillVar;
-      Spill->setWeight(RegWeight::Zero);
+      if (Traits::Is64Bit) {
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        Variable *T = makeReg(IceType_f64);
+        // Movd requires its fp argument (in this case, the bitcast destination)
+        // to be an xmm register.
+        T->setWeightInfinite();
+        _movd(T, Src0RM);
+        _mov(Dest, T);
+      } else {
+        Src0 = legalize(Src0);
+        if (llvm::isa<typename Traits::X86OperandMem>(Src0)) {
+          Variable *T = Func->makeVariable(Dest->getType());
+          _movq(T, Src0);
+          _movq(Dest, T);
+          break;
+        }
+        // a.f64 = bitcast b.i64 ==>
+        //   t_lo.i32 = b_lo.i32
+        //   FakeDef(s.f64)
+        //   lo(s.f64) = t_lo.i32
+        //   t_hi.i32 = b_hi.i32
+        //   hi(s.f64) = t_hi.i32
+        //   a.f64 = s.f64
+        typename Traits::SpillVariable *SpillVar =
+            Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
+        SpillVar->setLinkedTo(Dest);
+        Variable *Spill = SpillVar;
+        Spill->setWeight(RegWeight::Zero);
 
-      Variable *T_Lo = nullptr, *T_Hi = nullptr;
-      typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create(
-          Func, Spill, Traits::VariableSplit::Low);
-      typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create(
-          Func, Spill, Traits::VariableSplit::High);
-      _mov(T_Lo, loOperand(Src0));
-      // Technically, the Spill is defined after the _store happens, but
-      // SpillLo is considered a "use" of Spill so define Spill before it
-      // is used.
-      Context.insert(InstFakeDef::create(Func, Spill));
-      _store(T_Lo, SpillLo);
-      _mov(T_Hi, hiOperand(Src0));
-      _store(T_Hi, SpillHi);
-      _movq(Dest, Spill);
+        Variable *T_Lo = nullptr, *T_Hi = nullptr;
+        typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::Low);
+        typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::High);
+        _mov(T_Lo, loOperand(Src0));
+        // Technically, the Spill is defined after the _store happens, but
+        // SpillLo is considered a "use" of Spill so define Spill before it
+        // is used.
+        Context.insert(InstFakeDef::create(Func, Spill));
+        _store(T_Lo, SpillLo);
+        _mov(T_Hi, hiOperand(Src0));
+        _store(T_Hi, SpillHi);
+        _movq(Dest, Spill);
+      }
     } break;
     case IceType_v8i1: {
       assert(Src0->getType() == IceType_i8);
@@ -2615,32 +2683,8 @@
     return;
   }
 
-  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
-  if (Src0->getType() == IceType_i64) {
-    InstIcmp::ICond Condition = Inst->getCondition();
-    size_t Index = static_cast<size_t>(Condition);
-    assert(Index < Traits::TableIcmp64Size);
-    Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
-    Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
-    Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
-    Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
-    Constant *Zero = Ctx->getConstantZero(IceType_i32);
-    Constant *One = Ctx->getConstantInt32(1);
-    typename Traits::Insts::Label *LabelFalse =
-        Traits::Insts::Label::create(Func, this);
-    typename Traits::Insts::Label *LabelTrue =
-        Traits::Insts::Label::create(Func, this);
-    _mov(Dest, One);
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Index].C1 != Traits::Cond::Br_None)
-      _br(Traits::TableIcmp64[Index].C1, LabelTrue);
-    if (Traits::TableIcmp64[Index].C2 != Traits::Cond::Br_None)
-      _br(Traits::TableIcmp64[Index].C2, LabelFalse);
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Index].C3, LabelTrue);
-    Context.insert(LabelFalse);
-    _mov_nonkillable(Dest, Zero);
-    Context.insert(LabelTrue);
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
+    lowerIcmp64(Inst);
     return;
   }
 
@@ -2650,6 +2694,40 @@
   _setcc(Dest, Traits::getIcmp32Mapping(Inst->getCondition()));
 }
 
+template <typename Machine>
+template <typename T>
+typename std::enable_if<!T::Is64Bit, void>::type
+TargetX86Base<Machine>::lowerIcmp64(const InstIcmp *Inst) {
+  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
+  Operand *Src0 = legalize(Inst->getSrc(0));
+  Operand *Src1 = legalize(Inst->getSrc(1));
+  Variable *Dest = Inst->getDest();
+  InstIcmp::ICond Condition = Inst->getCondition();
+  size_t Index = static_cast<size_t>(Condition);
+  assert(Index < Traits::TableIcmp64Size);
+  Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
+  Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
+  Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
+  Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  Constant *One = Ctx->getConstantInt32(1);
+  typename Traits::Insts::Label *LabelFalse =
+      Traits::Insts::Label::create(Func, this);
+  typename Traits::Insts::Label *LabelTrue =
+      Traits::Insts::Label::create(Func, this);
+  _mov(Dest, One);
+  _cmp(Src0HiRM, Src1HiRI);
+  if (Traits::TableIcmp64[Index].C1 != Traits::Cond::Br_None)
+    _br(Traits::TableIcmp64[Index].C1, LabelTrue);
+  if (Traits::TableIcmp64[Index].C2 != Traits::Cond::Br_None)
+    _br(Traits::TableIcmp64[Index].C2, LabelFalse);
+  _cmp(Src0LoRM, Src1LoRI);
+  _br(Traits::TableIcmp64[Index].C3, LabelTrue);
+  Context.insert(LabelFalse);
+  _mov_nonkillable(Dest, Zero);
+  Context.insert(LabelTrue);
+}
+
 template <class Machine>
 void TargetX86Base<Machine>::lowerInsertElement(const InstInsertElement *Inst) {
   Operand *SourceVectNotLegalized = Inst->getSrc(0);
@@ -2848,7 +2926,7 @@
       return;
     }
     Variable *Dest = Instr->getDest();
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // Follow what GCC does and use a movq instead of what lowerLoad()
       // normally does (split the load into two).
       // Thus, this skips load/arithmetic op folding. Load/arithmetic folding
@@ -2898,7 +2976,7 @@
     // Add a fence after the store to make it visible.
     Operand *Value = Instr->getArg(0);
     Operand *Ptr = Instr->getArg(1);
-    if (Value->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
       // Use a movq instead of what lowerStore() normally does
       // (split the store into two), following what GCC does.
       // Cast the bits from int -> to an xmm register first.
@@ -2922,7 +3000,7 @@
     Operand *Val = Instr->getArg(0);
     // In 32-bit mode, bswap only works on 32-bit arguments, and the
     // argument must be a register. Use rotate left for 16-bit bswap.
-    if (Val->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
       Val = legalizeUndef(Val);
       Variable *T_Lo = legalizeToReg(loOperand(Val));
       Variable *T_Hi = legalizeToReg(hiOperand(Val));
@@ -2932,7 +3010,8 @@
       _bswap(T_Hi);
       _mov(DestLo, T_Hi);
       _mov(DestHi, T_Lo);
-    } else if (Val->getType() == IceType_i32) {
+    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
+               Val->getType() == IceType_i32) {
       Variable *T = legalizeToReg(Val);
       _bswap(T);
       _mov(Dest, T);
@@ -2949,11 +3028,28 @@
   }
   case Intrinsics::Ctpop: {
     Variable *Dest = Instr->getDest();
+    Variable *T = nullptr;
     Operand *Val = Instr->getArg(0);
-    InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
-                                        ? H_call_ctpop_i32
-                                        : H_call_ctpop_i64,
-                                    Dest, 1);
+    Type ValTy = Val->getType();
+    assert(ValTy == IceType_i32 || ValTy == IceType_i64);
+
+    if (!Traits::Is64Bit) {
+      T = Dest;
+    } else {
+      T = makeReg(IceType_i64);
+      if (ValTy == IceType_i32) {
+        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
+        // converting it to a 64-bit value, and using ctpop_i64. _movzx should
+        // ensure we will not have any bits set on Val's upper 32 bits.
+        Variable *V = makeReg(IceType_i64);
+        _movzx(V, Val);
+        Val = V;
+      }
+      ValTy = IceType_i64;
+    }
+
+    InstCall *Call = makeHelperCall(
+        ValTy == IceType_i32 ? H_call_ctpop_i32 : H_call_ctpop_i64, T, 1);
     Call->addArg(Val);
     lowerCall(Call);
     // The popcount helpers always return 32-bit values, while the intrinsic's
@@ -2961,10 +3057,33 @@
     // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
     // the user doesn't do that in the IR. If the user does that in the IR,
     // then this zero'ing instruction is dead and gets optimized out.
-    if (Val->getType() == IceType_i64) {
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(DestHi, Zero);
+    if (!Traits::Is64Bit) {
+      assert(T == Dest);
+      if (Val->getType() == IceType_i64) {
+        Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Constant *Zero = Ctx->getConstantZero(IceType_i32);
+        _mov(DestHi, Zero);
+      }
+    } else {
+      assert(Val->getType() == IceType_i64);
+      // T is 64 bit. It needs to be copied to dest. We need to:
+      //
+      // T_1.32 = trunc T.64 to i32
+      // T_2.64 = zext T_1.32 to i64
+      // Dest.<<right_size>> = T_2.<<right_size>>
+      //
+      // which ensures the upper 32 bits will always be cleared. Just doing a
+      //
+      // mov Dest.32 = trunc T.64 to i32
+      //
+      // is dangerous because there's a chance the compiler will optimize this
+      // copy out. To use _movzx we need two new registers (one 32-, and
+      // another 64-bit wide.)
+      Variable *T_1 = makeReg(IceType_i32);
+      _mov(T_1, T);
+      Variable *T_2 = makeReg(IceType_i64);
+      _movzx(T_2, T_1);
+      _mov(Dest, T_2);
     }
     return;
   }
@@ -2974,7 +3093,7 @@
     Operand *Val = legalize(Instr->getArg(0));
     Operand *FirstVal;
     Operand *SecondVal = nullptr;
-    if (Val->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
       FirstVal = loOperand(Val);
       SecondVal = hiOperand(Val);
     } else {
@@ -2991,7 +3110,7 @@
     Operand *Val = legalize(Instr->getArg(0));
     Operand *FirstVal;
     Operand *SecondVal = nullptr;
-    if (Val->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
       FirstVal = hiOperand(Val);
       SecondVal = loOperand(Val);
     } else {
@@ -3099,7 +3218,7 @@
 void TargetX86Base<Machine>::lowerAtomicCmpxchg(Variable *DestPrev,
                                                 Operand *Ptr, Operand *Expected,
                                                 Operand *Desired) {
-  if (Expected->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Expected->getType() == IceType_i64) {
     // Reserve the pre-colored registers first, before adding any more
     // infinite-weight variables from formMemoryOperand's legalization.
     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
@@ -3217,7 +3336,7 @@
     Func->setError("Unknown AtomicRMW operation");
     return;
   case Intrinsics::AtomicAdd: {
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // All the fall-through paths must set this to true, but use this
       // for asserting.
       NeedsCmpxchg = true;
@@ -3235,7 +3354,7 @@
     return;
   }
   case Intrinsics::AtomicSub: {
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       NeedsCmpxchg = true;
       Op_Lo = &TargetX86Base<Machine>::_sub;
       Op_Hi = &TargetX86Base<Machine>::_sbb;
@@ -3272,7 +3391,7 @@
     Op_Hi = &TargetX86Base<Machine>::_xor;
     break;
   case Intrinsics::AtomicExchange:
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       NeedsCmpxchg = true;
       // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
       // just need to be moved to the ecx and ebx registers.
@@ -3326,7 +3445,7 @@
   // If Op_{Lo,Hi} are nullptr, then just copy the value.
   Val = legalize(Val);
   Type Ty = Val->getType();
-  if (Ty == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
     typename Traits::X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
@@ -3458,7 +3577,7 @@
   if (!Cttz) {
     _xor(T_Dest, ThirtyOne);
   }
-  if (Ty == IceType_i32) {
+  if (Traits::Is64Bit || Ty == IceType_i32) {
     _mov(Dest, T_Dest);
     return;
   }
@@ -4138,7 +4257,7 @@
     std::swap(SrcT, SrcF);
     Cond = InstX86Base<Machine>::getOppositeCondition(Cond);
   }
-  if (DestTy == IceType_i64) {
+  if (!Traits::Is64Bit && DestTy == IceType_i64) {
     SrcT = legalizeUndef(SrcT);
     SrcF = legalizeUndef(SrcF);
     // Set the low portion.
@@ -4160,7 +4279,8 @@
     return;
   }
 
-  assert(DestTy == IceType_i16 || DestTy == IceType_i32);
+  assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
+         (Traits::Is64Bit && DestTy == IceType_i64));
   Variable *T = nullptr;
   SrcF = legalize(SrcF);
   _mov(T, SrcF);
@@ -4177,7 +4297,7 @@
       formMemoryOperand(Addr, Value->getType());
   Type Ty = NewAddr->getType();
 
-  if (Ty == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     Value = legalizeUndef(Value);
     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
     Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
@@ -4225,7 +4345,7 @@
                                                uint64_t Min, uint64_t Max) {
   // TODO(ascull): 64-bit should not reach here but only because it is not
   // implemented yet. This should be able to handle the 64-bit case.
-  assert(Comparison->getType() != IceType_i64);
+  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
   // Subtracting 0 is a nop so don't do it
   if (Min != 0) {
     // Avoid clobbering the comparison by copying it
@@ -4324,7 +4444,7 @@
 
   assert(CaseClusters.size() != 0); // Should always be at least one
 
-  if (Src0->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
     Src0 = legalize(Src0); // get Base/Index into physical registers
     Operand *Src0Lo = loOperand(Src0);
     Operand *Src0Hi = hiOperand(Src0);
@@ -4529,7 +4649,7 @@
   Operand *Src = RMW->getData();
   Type Ty = Src->getType();
   typename Traits::X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
-  if (Ty == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     Src = legalizeUndef(Src);
     Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
     Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
@@ -4563,7 +4683,8 @@
       return;
     }
   } else {
-    // i8, i16, i32
+    // x86-32: i8, i16, i32
+    // x86-64: i8, i16, i32, i64
     switch (RMW->getOp()) {
     default:
       // TODO(stichnot): Implement other arithmetic operators.
@@ -4608,8 +4729,14 @@
 /// turned into zeroes, since loOperand() and hiOperand() don't expect
 /// Undef input.
 template <class Machine> void TargetX86Base<Machine>::prelowerPhis() {
-  // Pause constant blinding or pooling, blinding or pooling will be done later
-  // during phi lowering assignments
+  if (Traits::Is64Bit) {
+    // On x86-64 we don't need to prelower phis -- the architecture can handle
+    // 64-bit integer natively.
+    return;
+  }
+
+  // Pause constant blinding or pooling; blinding or pooling will be done
+  // later during phi lowering assignments.
   BoolFlagSaver B(RandomizationPoolingPaused, true);
   PhiLowering::prelowerPhis32Bit<TargetX86Base<Machine>>(
       this, Context.getNode(), Func);
@@ -4770,6 +4897,16 @@
     // There should be no constants of vector type (other than undef).
     assert(!isVectorType(Ty));
 
+    // If the operand is a 64 bit constant integer we need to legalize it to a
+    // register in x86-64.
+    if (Traits::Is64Bit) {
+      if (llvm::isa<ConstantInteger64>(Const)) {
+        Variable *V = copyToReg(Const, RegNum);
+        V->setWeightInfinite();
+        return V;
+      }
+    }
+
     // If the operand is an 32 bit constant integer, we should check
     // whether we need to randomize it or pool it.
     if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
@@ -4907,7 +5044,7 @@
 template <class Machine>
 Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) {
   // There aren't any 64-bit integer registers for x86-32.
-  assert(Type != IceType_i64);
+  assert(Traits::Is64Bit || Type != IceType_i64);
   Variable *Reg = Func->makeVariable(Type);
   if (RegNum == Variable::NoRegister)
     Reg->setWeightInfinite();
@@ -4939,8 +5076,15 @@
 }
 
 template <class Machine>
-void TargetX86Base<Machine>::emit(const ConstantInteger64 *) const {
-  llvm::report_fatal_error("Not expecting to emit 64-bit integers");
+void TargetX86Base<Machine>::emit(const ConstantInteger64 *C) const {
+  if (!Traits::Is64Bit) {
+    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
+  } else {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    Str << getConstantPrefix() << C->getValue();
+  }
 }
 
 template <class Machine>
@@ -5085,8 +5229,8 @@
                                           MemOperand->getBase(), Mask1);
         // If we have already assigned a physical register, we must come from
         // advancedPhiLowering()=>lowerAssign(). In this case we should reuse
-        // the assigned register as this assignment is that start of its use-def
-        // chain. So we add RegNum argument here.
+        // the assigned register as this assignment is the start of its
+        // use-def chain. So we add RegNum argument here.
         Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
         _lea(RegTemp, TempMemOperand);
         // As source operand doesn't use the dstreg, we don't need to add