Subzero. Native 64-bit int arithmetic on x86-64.

This CL modifies the x86 instruction selection template to allow native
64-bit GPR support. It also enables x86-64 crosstests.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1273153002.
diff --git a/Makefile.standalone b/Makefile.standalone
index 0248a11..d53332f 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -325,6 +325,7 @@
 
 RT_SRC := runtime/szrt.c runtime/szrt_ll.ll runtime/szrt_profiler.c
 RT_OBJ := build/runtime/szrt_native_x8632.o build/runtime/szrt_sb_x8632.o \
+	build/runtime/szrt_native_x8664.o build/runtime/szrt_sb_x8664.o \
 	build/runtime/szrt_native_arm32.o build/runtime/szrt_sb_arm32.o
 
 runtime: $(RT_OBJ)
@@ -348,10 +349,13 @@
 check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
        # Do all native/sse2 tests, but only test_vector_ops for native/sse4.1.
        # For (slow) sandboxed tests, limit to Om1/sse4.1.
+       # TODO(jpp): implement x8664 sandbox, then enable x8664,sandbox xtests.
 	./pydir/crosstest_generator.py -v --lit \
 	  --toolchain-root $(TOOLCHAIN_ROOT) \
 	  -i x8632,native,sse2 -i x8632,native,sse4.1,test_vector_ops \
 	  -i x8632,sandbox,sse4.1,Om1 \
+	  -i x8664,native,sse2 -i x8664,native,sse4.1,test_vector_ops \
+	  -e x8664,native,sse2,test_global \
 	  -i arm32,native,neon,simple_loop \
 	  -i arm32,native,neon,mem_intrin \
 	  -i arm32,native,neon,test_bitmanip \
diff --git a/crosstest/mem_intrin.cpp b/crosstest/mem_intrin.cpp
index 612edce..0fe0387 100644
--- a/crosstest/mem_intrin.cpp
+++ b/crosstest/mem_intrin.cpp
@@ -8,6 +8,7 @@
 #include <cstring>
 
 #include "mem_intrin.h"
+#include "xdefs.h"
 
 typedef int elem_t;
 
@@ -15,9 +16,9 @@
  * Reset buf to the sequence of bytes: n, n+1, n+2 ... length - 1
  */
 static void __attribute__((noinline))
-reset_buf(uint8_t *buf, uint8_t init, size_t length) {
-  size_t i;
-  size_t v = init;
+reset_buf(uint8_t *buf, uint8_t init, SizeT length) {
+  SizeT i;
+  SizeT v = init;
   for (i = 0; i < length; ++i)
     buf[i] = v++;
 }
@@ -27,8 +28,8 @@
  * smaller buffers, whose total won't approach 2**16).
  */
 static int __attribute__((noinline))
-fletcher_checksum(uint8_t *buf, size_t length) {
-  size_t i;
+fletcher_checksum(uint8_t *buf, SizeT length) {
+  SizeT i;
   int sum = 0;
   int sum_of_sums = 0;
   const int kModulus = 255;
@@ -63,20 +64,20 @@
   return fletcher_checksum((uint8_t *)buf, BYTE_LENGTH);
 }
 
-int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length) {
+int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length) {
   reset_buf(buf, init, length);
   memcpy((void *)buf2, (void *)buf, length);
   return fletcher_checksum(buf2, length);
 }
 
-int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length) {
+int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length) {
   int sum1;
   int sum2;
   const int overlap_bytes = 4 * sizeof(elem_t);
   if (length <= overlap_bytes)
     return 0;
   uint8_t *overlap_buf = buf + overlap_bytes;
-  size_t reduced_length = length - overlap_bytes;
+  SizeT reduced_length = length - overlap_bytes;
   reset_buf(buf, init, length);
 
   /* Test w/ overlap. */
@@ -88,7 +89,7 @@
   return sum1 + sum2;
 }
 
-int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length) {
+int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length) {
   memset((void *)buf, init, length);
   memset((void *)buf2, init + 4, length);
   return fletcher_checksum(buf, length) + fletcher_checksum(buf2, length);
diff --git a/crosstest/mem_intrin.h b/crosstest/mem_intrin.h
index 70f02ae..f04e1b2 100644
--- a/crosstest/mem_intrin.h
+++ b/crosstest/mem_intrin.h
@@ -4,10 +4,11 @@
  * There is no include guard since this will be included multiple times,
  * under different namespaces.
  */
+#include "xdefs.h"
 
-int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length);
-int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length);
-int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, size_t length);
+int memcpy_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
+int memmove_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
+int memset_test(uint8_t *buf, uint8_t *buf2, uint8_t init, SizeT length);
 
 int memcpy_test_fixed_len(uint8_t init);
 int memmove_test_fixed_len(uint8_t init);
diff --git a/crosstest/mem_intrin_main.cpp b/crosstest/mem_intrin_main.cpp
index 70e3a67..e1102ec 100644
--- a/crosstest/mem_intrin_main.cpp
+++ b/crosstest/mem_intrin_main.cpp
@@ -5,6 +5,8 @@
 #include <cstdio>
 
 #include "mem_intrin.h"
+#include "xdefs.h"
+
 namespace Subzero_ {
 #include "mem_intrin.h"
 }
@@ -12,7 +14,7 @@
 #define XSTR(s) STR(s)
 #define STR(s) #s
 
-void testFixedLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+void testFixedLen(SizeT &TotalTests, SizeT &Passes, SizeT &Failures) {
 #define do_test_fixed(test_func)                                               \
   for (uint8_t init_val = 0; init_val < 100; ++init_val) {                     \
     ++TotalTests;                                                              \
@@ -33,11 +35,11 @@
 #undef do_test_fixed
 }
 
-void testVariableLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+void testVariableLen(SizeT &TotalTests, SizeT &Passes, SizeT &Failures) {
   uint8_t buf[256];
   uint8_t buf2[256];
 #define do_test_variable(test_func)                                            \
-  for (size_t len = 4; len < 128; ++len) {                                     \
+  for (SizeT len = 4; len < 128; ++len) {                                      \
     for (uint8_t init_val = 0; init_val < 100; ++init_val) {                   \
       ++TotalTests;                                                            \
       int llc_result = test_func(buf, buf2, init_val, len);                    \
@@ -58,7 +60,11 @@
 #undef do_test_variable
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   unsigned TotalTests = 0;
   unsigned Passes = 0;
   unsigned Failures = 0;
diff --git a/crosstest/simple_loop_main.c b/crosstest/simple_loop_main.c
index 5ff36b8..6c738b9 100644
--- a/crosstest/simple_loop_main.c
+++ b/crosstest/simple_loop_main.c
@@ -6,7 +6,11 @@
 int simple_loop(int *a, int n);
 int Subzero_simple_loop(int *a, int n);
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   unsigned TotalTests = 0;
   unsigned Passes = 0;
   unsigned Failures = 0;
diff --git a/crosstest/stack_hack.x8664.c b/crosstest/stack_hack.x8664.c
new file mode 100644
index 0000000..45b0bb7
--- /dev/null
+++ b/crosstest/stack_hack.x8664.c
@@ -0,0 +1,101 @@
+//===- subzero/crosstest/stack_hack.x8664.c - X8664 stack hack ------------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements main() for x86-64 crosstests. main() maps a fresh stack that ends
+// at a fixed address below 4 GB, switches %rsp to it, and then runs the test's
+// real entry point, wrapped_main(), on that stack.
+//
+//===----------------------------------------------------------------------===//
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+// X8664_STACK_HACK needs to be defined before xdefs.h is included.
+#define X8664_STACK_HACK
+#include "xdefs.h"
+
+/// xSetStack is used to set %rsp to NewRsp. OldRsp is a pointer that will be
+/// used to save the old %rsp value.
+///
+/// The output is early-clobber ("=&r") so it can never share a register with
+/// NewRsp, which is read after the output is written. NewRsp is an input-only
+/// operand and is left unmodified, as GCC-style inline asm requires. The
+/// "memory" clobber keeps memory accesses from being moved across the switch.
+#define xSetStack(NewRsp, OldRsp)                                              \
+  do {                                                                         \
+    __asm__ volatile("movq    %%rsp, %0\n\t"                                   \
+                     "movq    %1, %%rsp"                                       \
+                     : "=&r"(*(OldRsp))                                        \
+                     : "r"(NewRsp)                                             \
+                     : "memory");                                              \
+  } while (0)
+
+extern int wrapped_main(int argc, char *argv[]);
+
+/// Returns the lowest address of a Size-byte stack that ends at StackEnd.
+/// Asserts that this address is 4 MB-aligned and that Size does not exceed
+/// StackEnd.
+unsigned char *xStackStart(uint32 StackEnd, uint32 Size) {
+  const uint32 PageBoundary = 4 << 20; // 4 MB.
+  // Subtract in 64 bits: a 32-bit subtraction would wrap silently when
+  // Size > StackEnd, and the "wraps" check below could never fire.
+  const uint64 StackStart = (uint64)StackEnd - Size;
+  assert((StackStart & (PageBoundary - 1)) == 0 &&
+         "StackStart not aligned to page boundary.");
+  (void)PageBoundary; // Only used by the assert above.
+  assert((StackStart & 0xFFFFFFFF00000000ull) == 0 && "StackStart wraps.");
+  return (unsigned char *)StackStart;
+}
+
+/// Maps Size bytes of readable/writable anonymous memory at the fixed address
+/// xStackStart(StackEnd, Size) and returns the mapping's base address. Aborts
+/// if the mapping cannot be created.
+unsigned char *xAllocStack(uint64 StackEnd, uint32 Size) {
+  assert((StackEnd & 0xFFFFFFFF00000000ull) == 0 && "Invalid StackEnd.");
+  void *Stack =
+      mmap(xStackStart(StackEnd, Size), Size, PROT_READ | PROT_WRITE,
+           MAP_FIXED | MAP_PRIVATE | MAP_GROWSDOWN | MAP_ANONYMOUS, -1, 0);
+  assert(Stack != MAP_FAILED && "mmap failed. no stack.");
+  // Stay fatal when asserts are compiled out: the caller installs the result
+  // as %rsp, so MAP_FAILED must never be returned.
+  if (Stack == MAP_FAILED)
+    abort();
+  return Stack;
+}
+
+/// Unmaps the stack previously mapped by xAllocStack(StackEnd, Size).
+void xDeallocStack(uint64 StackEnd, uint32 Size) {
+  assert((StackEnd & 0xFFFFFFFF00000000ull) == 0 && "Invalid StackEnd.");
+  munmap(xStackStart(StackEnd, Size), Size);
+}
+
+/// Runs wrapped_main() on a freshly mapped stack and returns its result.
+int main(int argc, char *argv[]) {
+  // These "locals" need to live **NOT** in the stack.
+  static int Argc;
+  static char **Argv;
+  static const uint32_t StackEnd = 0x80000000;
+  static const uint32_t StackSize = 40 * 1024 * 1024;
+  static unsigned char *new_rsp;
+  static unsigned char *old_rsp;
+  static unsigned char *dummy_rsp;
+  static int Failures;
+  Argc = argc;
+  Argv = argv;
+  // The stack grows down, so %rsp starts at the end of the mapping.
+  new_rsp = xAllocStack(StackEnd, StackSize) + StackSize;
+  xSetStack(new_rsp, &old_rsp);
+  Failures = wrapped_main(Argc, Argv);
+  // Back to the original stack; the abandoned %rsp value is not needed.
+  xSetStack(old_rsp, &dummy_rsp);
+  xDeallocStack(StackEnd, StackSize);
+  return Failures;
+}
diff --git a/crosstest/test_arith.cpp b/crosstest/test_arith.cpp
index 446ea04..47fd47b 100644
--- a/crosstest/test_arith.cpp
+++ b/crosstest/test_arith.cpp
@@ -17,13 +17,14 @@
 #include <stdint.h>
 
 #include "test_arith.h"
+#include "xdefs.h"
 
 #define X(inst, op, isdiv, isshift)                                            \
   bool test##inst(bool a, bool b) { return a op b; }                           \
   uint8_t test##inst(uint8_t a, uint8_t b) { return a op b; }                  \
   uint16_t test##inst(uint16_t a, uint16_t b) { return a op b; }               \
   uint32_t test##inst(uint32_t a, uint32_t b) { return a op b; }               \
-  uint64_t test##inst(uint64_t a, uint64_t b) { return a op b; }               \
+  uint64 test##inst(uint64 a, uint64 b) { return a op b; }                     \
   v4ui32 test##inst(v4ui32 a, v4ui32 b) { return a op b; }                     \
   v8ui16 test##inst(v8ui16 a, v8ui16 b) { return a op b; }                     \
   v16ui8 test##inst(v16ui8 a, v16ui8 b) { return a op b; }
@@ -35,7 +36,7 @@
   myint8_t test##inst(myint8_t a, myint8_t b) { return a op b; }               \
   int16_t test##inst(int16_t a, int16_t b) { return a op b; }                  \
   int32_t test##inst(int32_t a, int32_t b) { return a op b; }                  \
-  int64_t test##inst(int64_t a, int64_t b) { return a op b; }                  \
+  int64 test##inst(int64 a, int64 b) { return a op b; }                        \
   v4si32 test##inst(v4si32 a, v4si32 b) { return a op b; }                     \
   v8si16 test##inst(v8si16 a, v8si16 b) { return a op b; }                     \
   v16si8 test##inst(v16si8 a, v16si8 b) { return a op b; }
diff --git a/crosstest/test_arith.h b/crosstest/test_arith.h
index be7f0b1..e348418 100644
--- a/crosstest/test_arith.h
+++ b/crosstest/test_arith.h
@@ -14,6 +14,7 @@
 
 #include <stdint.h>
 #include "test_arith.def"
+#include "xdefs.h"
 
 #include "vectors.h"
 
@@ -22,7 +23,7 @@
   uint8_t test##inst(uint8_t a, uint8_t b);                                    \
   uint16_t test##inst(uint16_t a, uint16_t b);                                 \
   uint32_t test##inst(uint32_t a, uint32_t b);                                 \
-  uint64_t test##inst(uint64_t a, uint64_t b);                                 \
+  uint64 test##inst(uint64 a, uint64 b);                                       \
   v4ui32 test##inst(v4ui32 a, v4ui32 b);                                       \
   v8ui16 test##inst(v8ui16 a, v8ui16 b);                                       \
   v16ui8 test##inst(v16ui8 a, v16ui8 b);
@@ -34,7 +35,7 @@
   myint8_t test##inst(myint8_t a, myint8_t b);                                 \
   int16_t test##inst(int16_t a, int16_t b);                                    \
   int32_t test##inst(int32_t a, int32_t b);                                    \
-  int64_t test##inst(int64_t a, int64_t b);                                    \
+  int64 test##inst(int64 a, int64 b);                                          \
   v4si32 test##inst(v4si32 a, v4si32 b);                                       \
   v8si16 test##inst(v8si16 a, v8si16 b);                                       \
   v16si8 test##inst(v16si8 a, v16si8 b);
diff --git a/crosstest/test_arith_main.cpp b/crosstest/test_arith_main.cpp
index b4c70ee..8f420f1 100644
--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
@@ -28,6 +28,8 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_arith.h"
+#include "xdefs.h"
+
 namespace Subzero_ {
 #include "test_arith.h"
 }
@@ -363,7 +365,11 @@
   }
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
@@ -372,7 +378,7 @@
   testsInt<uint8_t, myint8_t>(TotalTests, Passes, Failures);
   testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
   testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
-  testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
+  testsInt<uint64, int64>(TotalTests, Passes, Failures);
   testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
   testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
   testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);
diff --git a/crosstest/test_bitmanip.def b/crosstest/test_bitmanip.def
index 0dac033..443ad5a 100644
--- a/crosstest/test_bitmanip.def
+++ b/crosstest/test_bitmanip.def
@@ -14,6 +14,8 @@
 #ifndef TEST_BIT_MANIP_DEF
 #define TEST_BIT_MANIP_DEF
 
+#include "xdefs.h"
+
 #define STR(s) #s
 
 #define BMI_OPS  \
@@ -25,13 +27,13 @@
 
 #define BMI_TYPES \
   /* type */      \
-  X(uint32_t)     \
-  X(uint64_t)
+  X(uint32)     \
+  X(uint64)
 // #define X(type)
 
 #define FOR_ALL_BMI_TYPES_INST(F, inst) \
-  F(inst, uint32_t)                     \
-  F(inst, uint64_t)
+  F(inst, uint32)                     \
+  F(inst, uint64)
 
 #define FOR_ALL_BMI_OP_TYPES(X) \
   FOR_ALL_BMI_TYPES_INST(X, ctlz)     \
@@ -42,7 +44,7 @@
 #define BSWAP_TABLE              \
   /* type, builtin_name */       \
   X(uint16_t, __builtin_bswap16) \
-  X(uint32_t, __builtin_bswap32) \
-  X(uint64_t, __builtin_bswap64)
+  X(uint32, __builtin_bswap32) \
+  X(uint64, __builtin_bswap64)
 
 #endif // TEST_BIT_MANIP_DEF
diff --git a/crosstest/test_bitmanip_main.cpp b/crosstest/test_bitmanip_main.cpp
index dfb5d1a..f872f20 100644
--- a/crosstest/test_bitmanip_main.cpp
+++ b/crosstest/test_bitmanip_main.cpp
@@ -23,11 +23,13 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_bitmanip.h"
+#include "xdefs.h"
+
 namespace Subzero_ {
 #include "test_bitmanip.h"
 }
 
-volatile uint64_t Values[] = {
+volatile uint64 Values[] = {
     0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
     0xfffe, 0xffff, 0xc0de, 0xabcd, 0xdcba, 0x007fffff /*Max subnormal + */,
     0x00800000 /*Min+ */, 0x7f7fffff /*Max+ */, 0x7f800000 /*+Inf*/,
@@ -71,9 +73,9 @@
       } else {
         ++Failures;
         std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                  << "(" << static_cast<uint64_t>(Value)
-                  << "): sz=" << static_cast<uint64_t>(ResultSz)
-                  << " llc=" << static_cast<uint64_t>(ResultLlc) << "\n";
+                  << "(" << static_cast<uint64>(Value)
+                  << "): sz=" << static_cast<uint64>(ResultSz)
+                  << " llc=" << static_cast<uint64>(ResultLlc) << "\n";
       }
     }
   }
@@ -101,24 +103,28 @@
       } else {
         ++Failures;
         std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                  << "(" << static_cast<uint64_t>(Value)
-                  << "): sz=" << static_cast<uint64_t>(ResultSz)
-                  << " llc=" << static_cast<uint64_t>(ResultLlc) << "\n";
+                  << "(" << static_cast<uint64>(Value)
+                  << "): sz=" << static_cast<uint64>(ResultSz)
+                  << " llc=" << static_cast<uint64>(ResultLlc) << "\n";
       }
     }
   }
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
 
   testBitManip<uint32_t>(TotalTests, Passes, Failures);
-  testBitManip<uint64_t>(TotalTests, Passes, Failures);
+  testBitManip<uint64>(TotalTests, Passes, Failures);
   testByteSwap<uint16_t>(TotalTests, Passes, Failures);
   testByteSwap<uint32_t>(TotalTests, Passes, Failures);
-  testByteSwap<uint64_t>(TotalTests, Passes, Failures);
+  testByteSwap<uint64>(TotalTests, Passes, Failures);
 
   std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
             << " Failures=" << Failures << "\n";
diff --git a/crosstest/test_calling_conv.cpp b/crosstest/test_calling_conv.cpp
index e7fa616..364b0df 100644
--- a/crosstest/test_calling_conv.cpp
+++ b/crosstest/test_calling_conv.cpp
@@ -17,6 +17,7 @@
 #include <cstring>
 
 #include "test_calling_conv.h"
+#include "xdefs.h"
 
 #define CALL_AS_TYPE(Ty, Func) (reinterpret_cast<Ty *>(Func))
 
@@ -37,9 +38,9 @@
 
 void caller_vlvlivfvdviv(void) {
   v4f32 arg1 = {0, 1, 2, 3};
-  int64_t arg2 = 4;
+  int64 arg2 = 4;
   v4f32 arg3 = {6, 7, 8, 9};
-  int64_t arg4 = 10;
+  int64 arg4 = 10;
   int arg5 = 11;
   v4f32 arg6 = {12, 13, 14, 15};
   float arg7 = 16;
@@ -75,8 +76,8 @@
 }
 
 void __attribute__((noinline))
-callee_vlvlivfvdviv(v4f32 arg1, int64_t arg2, v4f32 arg3, int64_t arg4,
-                    int arg5, v4f32 arg6, float arg7, v4f32 arg8, double arg9,
+callee_vlvlivfvdviv(v4f32 arg1, int64 arg2, v4f32 arg3, int64 arg4, int arg5,
+                    v4f32 arg6, float arg7, v4f32 arg8, double arg9,
                     v4f32 arg10, int arg11, v4f32 arg12) {
   switch (ArgNum) {
     HANDLE_ARG(1);
diff --git a/crosstest/test_calling_conv.h b/crosstest/test_calling_conv.h
index 6cff49b..bf36465 100644
--- a/crosstest/test_calling_conv.h
+++ b/crosstest/test_calling_conv.h
@@ -14,6 +14,7 @@
 
 #include "test_calling_conv.def"
 #include "vectors.h"
+#include "xdefs.h"
 
 typedef void (*CalleePtrTy)();
 extern CalleePtrTy Callee;
@@ -31,6 +32,6 @@
 callee_vvvvv_Ty callee_vvvvv;
 
 void caller_vlvlivfvdviv();
-typedef void(callee_vlvlivfvdviv_Ty)(v4f32, int64_t, v4f32, int64_t, int, v4f32,
+typedef void(callee_vlvlivfvdviv_Ty)(v4f32, int64, v4f32, int64, int, v4f32,
                                      float, v4f32, double, v4f32, int, v4f32);
 callee_vlvlivfvdviv_Ty callee_vlvlivfvdviv;
diff --git a/crosstest/test_calling_conv_main.cpp b/crosstest/test_calling_conv_main.cpp
index ce5ecda..f0feef0 100644
--- a/crosstest/test_calling_conv_main.cpp
+++ b/crosstest/test_calling_conv_main.cpp
@@ -162,7 +162,11 @@
   }
 }
 
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
 int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
diff --git a/crosstest/test_cast.cpp b/crosstest/test_cast.cpp
index 6298320..a2668f8 100644
--- a/crosstest/test_cast.cpp
+++ b/crosstest/test_cast.cpp
@@ -16,6 +16,7 @@
 
 #include <stdint.h>
 #include "test_cast.h"
+#include "xdefs.h"
 
 template <typename FromType, typename ToType>
 ToType __attribute__((noinline)) cast(FromType a) {
@@ -38,8 +39,8 @@
   static ToType f(uint16_t a) { return cast<uint16_t, ToType>(a); }
   static ToType f(int32_t a) { return cast<int32_t, ToType>(a); }
   static ToType f(uint32_t a) { return cast<uint32_t, ToType>(a); }
-  static ToType f(int64_t a) { return cast<int64_t, ToType>(a); }
-  static ToType f(uint64_t a) { return cast<uint64_t, ToType>(a); }
+  static ToType f(int64 a) { return cast<int64, ToType>(a); }
+  static ToType f(uint64 a) { return cast<uint64, ToType>(a); }
   static ToType f(float a) { return cast<float, ToType>(a); }
   static ToType f(double a) { return cast<double, ToType>(a); }
 };
@@ -56,8 +57,8 @@
 template class Caster<uint16_t>;
 template class Caster<int32_t>;
 template class Caster<uint32_t>;
-template class Caster<int64_t>;
-template class Caster<uint64_t>;
+template class Caster<int64>;
+template class Caster<uint64>;
 template class Caster<float>;
 template class Caster<double>;
 
@@ -67,8 +68,8 @@
 double makeBitCasters() {
   double Result = 0;
   Result += castBits<uint32_t, float>(0);
-  Result += castBits<uint64_t, double>(0);
+  Result += castBits<uint64, double>(0);
   Result += castBits<float, uint32_t>(0);
-  Result += castBits<double, uint64_t>(0);
+  Result += castBits<double, uint64>(0);
   return Result;
 }
diff --git a/crosstest/test_cast_main.cpp b/crosstest/test_cast_main.cpp
index c395597..4596bfa 100644
--- a/crosstest/test_cast_main.cpp
+++ b/crosstest/test_cast_main.cpp
@@ -22,6 +22,7 @@
 
 #include "test_arith.def"
 #include "vectors.h"
+#include "xdefs.h"
 
 // Include test_cast.h twice - once normally, and once within the
 // Subzero_ namespace, corresponding to the llc and Subzero translated
@@ -82,8 +83,8 @@
   COMPARE(cast, FromType, int16_t, Val, FromTypeString);
   COMPARE(cast, FromType, uint32_t, Val, FromTypeString);
   COMPARE(cast, FromType, int32_t, Val, FromTypeString);
-  COMPARE(cast, FromType, uint64_t, Val, FromTypeString);
-  COMPARE(cast, FromType, int64_t, Val, FromTypeString);
+  COMPARE(cast, FromType, uint64, Val, FromTypeString);
+  COMPARE(cast, FromType, int64, Val, FromTypeString);
   COMPARE(cast, FromType, float, Val, FromTypeString);
   COMPARE(cast, FromType, double, Val, FromTypeString);
 }
@@ -110,7 +111,11 @@
   }
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
@@ -147,7 +152,7 @@
                                 0x80000000, 0x80000001, 0xfffffffe, 0xffffffff};
   static const size_t NumValsSi32 = sizeof(ValsSi32) / sizeof(*ValsSi32);
 
-  volatile uint64_t ValsUi64[] = {
+  volatile uint64 ValsUi64[] = {
       0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
       0xfffe, 0xffff, 0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
       0xfffffffe, 0xffffffff, 0x100000000ull, 0x100000001ull,
@@ -155,7 +160,7 @@
       0x8000000000000001ull, 0xfffffffffffffffeull, 0xffffffffffffffffull};
   static const size_t NumValsUi64 = sizeof(ValsUi64) / sizeof(*ValsUi64);
 
-  volatile int64_t ValsSi64[] = {
+  volatile int64 ValsSi64[] = {
       0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
       0xfffe, 0xffff, 0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
       0xfffffffe, 0xffffffff, 0x100000000ll, 0x100000001ll,
@@ -203,13 +208,13 @@
     testValue<int32_t>(Val, TotalTests, Passes, Failures, "int32_t");
   }
   for (size_t i = 0; i < NumValsUi64; ++i) {
-    uint64_t Val = ValsUi64[i];
-    testValue<uint64_t>(Val, TotalTests, Passes, Failures, "uint64_t");
-    COMPARE(castBits, uint64_t, double, Val, "uint64_t");
+    uint64 Val = ValsUi64[i];
+    testValue<uint64>(Val, TotalTests, Passes, Failures, "uint64");
+    COMPARE(castBits, uint64, double, Val, "uint64");
   }
   for (size_t i = 0; i < NumValsSi64; ++i) {
-    int64_t Val = ValsSi64[i];
-    testValue<int64_t>(Val, TotalTests, Passes, Failures, "int64_t");
+    int64 Val = ValsSi64[i];
+    testValue<int64>(Val, TotalTests, Passes, Failures, "int64");
   }
   for (size_t i = 0; i < NumValsF32; ++i) {
     for (unsigned j = 0; j < 2; ++j) {
@@ -226,7 +231,7 @@
       if (j > 0)
         Val = -Val;
       testValue<double>(Val, TotalTests, Passes, Failures, "double");
-      COMPARE(castBits, double, uint64_t, Val, "double");
+      COMPARE(castBits, double, uint64, Val, "double");
     }
   }
   testVector<v4ui32, v4f32>(TotalTests, Passes, Failures, "v4ui32", "v4f32");
diff --git a/crosstest/test_fcmp_main.cpp b/crosstest/test_fcmp_main.cpp
index 9e9c32c..b38de31 100644
--- a/crosstest/test_fcmp_main.cpp
+++ b/crosstest/test_fcmp_main.cpp
@@ -159,7 +159,11 @@
   }
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
diff --git a/crosstest/test_icmp.cpp b/crosstest/test_icmp.cpp
index 5ca2c46..6057545 100644
--- a/crosstest/test_icmp.cpp
+++ b/crosstest/test_icmp.cpp
@@ -15,12 +15,13 @@
 #include <stdint.h>
 
 #include "test_icmp.h"
+#include "xdefs.h"
 
 #define X(cmp, op)                                                             \
   bool icmp##cmp(uint8_t a, uint8_t b) { return a op b; }                      \
   bool icmp##cmp(uint16_t a, uint16_t b) { return a op b; }                    \
   bool icmp##cmp(uint32_t a, uint32_t b) { return a op b; }                    \
-  bool icmp##cmp(uint64_t a, uint64_t b) { return a op b; }                    \
+  bool icmp##cmp(uint64 a, uint64 b) { return a op b; }                        \
   v4ui32 icmp##cmp(v4ui32 a, v4ui32 b) { return a op b; }                      \
   v8ui16 icmp##cmp(v8ui16 a, v8ui16 b) { return a op b; }                      \
   v16ui8 icmp##cmp(v16ui8 a, v16ui8 b) { return a op b; }
@@ -31,7 +32,7 @@
   bool icmp##cmp(myint8_t a, myint8_t b) { return a op b; }                    \
   bool icmp##cmp(int16_t a, int16_t b) { return a op b; }                      \
   bool icmp##cmp(int32_t a, int32_t b) { return a op b; }                      \
-  bool icmp##cmp(int64_t a, int64_t b) { return a op b; }                      \
+  bool icmp##cmp(int64 a, int64 b) { return a op b; }                          \
   v4si32 icmp##cmp(v4si32 a, v4si32 b) { return a op b; }                      \
   v8si16 icmp##cmp(v8si16 a, v8si16 b) { return a op b; }                      \
   v16si8 icmp##cmp(v16si8 a, v16si8 b) { return a op b; }
diff --git a/crosstest/test_icmp.h b/crosstest/test_icmp.h
index 8a264d0..e1ea495 100644
--- a/crosstest/test_icmp.h
+++ b/crosstest/test_icmp.h
@@ -15,12 +15,13 @@
 #include "test_icmp.def"
 
 #include "vectors.h"
+#include "xdefs.h"
 
 #define X(cmp, op)                                                             \
   bool icmp##cmp(uint8_t a, uint8_t b);                                        \
   bool icmp##cmp(uint16_t a, uint16_t b);                                      \
   bool icmp##cmp(uint32_t a, uint32_t b);                                      \
-  bool icmp##cmp(uint64_t a, uint64_t b);                                      \
+  bool icmp##cmp(uint64 a, uint64 b);                                          \
   v4ui32 icmp##cmp(v4ui32 a, v4ui32 b);                                        \
   v8ui16 icmp##cmp(v8ui16 a, v8ui16 b);                                        \
   v16ui8 icmp##cmp(v16ui8 a, v16ui8 b);
@@ -31,7 +32,7 @@
   bool icmp##cmp(myint8_t a, myint8_t b);                                      \
   bool icmp##cmp(int16_t a, int16_t b);                                        \
   bool icmp##cmp(int32_t a, int32_t b);                                        \
-  bool icmp##cmp(int64_t a, int64_t b);                                        \
+  bool icmp##cmp(int64 a, int64 b);                                            \
   v4si32 icmp##cmp(v4si32 a, v4si32 b);                                        \
   v8si16 icmp##cmp(v8si16 a, v8si16 b);                                        \
   v16si8 icmp##cmp(v16si8 a, v16si8 b);
diff --git a/crosstest/test_icmp_main.cpp b/crosstest/test_icmp_main.cpp
index f27d53b..82e5b66 100644
--- a/crosstest/test_icmp_main.cpp
+++ b/crosstest/test_icmp_main.cpp
@@ -23,10 +23,13 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_icmp.h"
+
 namespace Subzero_ {
 #include "test_icmp.h"
 }
 
+#include "xdefs.h"
+
 volatile unsigned Values[] = {
     0x0,        0x1,        0x7ffffffe, 0x7fffffff, 0x80000000, 0x80000001,
     0xfffffffe, 0xffffffff, 0x7e,       0x7f,       0x80,       0x81,
@@ -265,7 +268,11 @@
   }
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
@@ -273,7 +280,7 @@
   testsInt<uint8_t, myint8_t>(TotalTests, Passes, Failures);
   testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
   testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
-  testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
+  testsInt<uint64, int64>(TotalTests, Passes, Failures);
   testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
   testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
   testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);
diff --git a/crosstest/test_select_main.cpp b/crosstest/test_select_main.cpp
index 5ccdcfb..1973416 100644
--- a/crosstest/test_select_main.cpp
+++ b/crosstest/test_select_main.cpp
@@ -130,7 +130,11 @@
   }
 }
 
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
 int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
diff --git a/crosstest/test_stacksave_main.c b/crosstest/test_stacksave_main.c
index 0691025..f03304e 100644
--- a/crosstest/test_stacksave_main.c
+++ b/crosstest/test_stacksave_main.c
@@ -22,7 +22,11 @@
 DECLARE_TESTS()
 DECLARE_TESTS(Subzero_)
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
diff --git a/crosstest/test_strengthreduce_main.cpp b/crosstest/test_strengthreduce_main.cpp
index 2c2aa98..acde64e 100644
--- a/crosstest/test_strengthreduce_main.cpp
+++ b/crosstest/test_strengthreduce_main.cpp
@@ -25,7 +25,11 @@
 #include "test_strengthreduce.h"
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
diff --git a/crosstest/test_sync_atomic.def b/crosstest/test_sync_atomic.def
index f84afde..7c05571 100644
--- a/crosstest/test_sync_atomic.def
+++ b/crosstest/test_sync_atomic.def
@@ -14,6 +14,8 @@
 #ifndef TEST_SYNC_ATOMIC_DEF
 #define TEST_SYNC_ATOMIC_DEF
 
+#include "xdefs.h"
+
 #define STR(s) #s
 
 #define RMWOP_TABLE  \
@@ -30,14 +32,14 @@
   X(uint8_t)              \
   X(uint16_t)             \
   X(uint32_t)             \
-  X(uint64_t)
+  X(uint64)
 //#define X(type)
 
 #define FOR_ALL_RMWTYPES_INST(F, inst) \
   F(inst, uint8_t)                     \
   F(inst, uint16_t)                    \
   F(inst, uint32_t)                    \
-  F(inst, uint64_t)
+  F(inst, uint64)
 
 #define FOR_ALL_RMWOP_TYPES(X)      \
   FOR_ALL_RMWTYPES_INST(X, add)     \
diff --git a/crosstest/test_sync_atomic_main.cpp b/crosstest/test_sync_atomic_main.cpp
index 63dfc80..f935e0a 100644
--- a/crosstest/test_sync_atomic_main.cpp
+++ b/crosstest/test_sync_atomic_main.cpp
@@ -28,11 +28,12 @@
 // Subzero_ namespace, corresponding to the llc and Subzero translated
 // object files, respectively.
 #include "test_sync_atomic.h"
+#include "xdefs.h"
 namespace Subzero_ {
 #include "test_sync_atomic.h"
 }
 
-volatile uint64_t Values[] = {
+volatile uint64 Values[] = {
     0, 1, 0x7e, 0x7f, 0x80, 0x81, 0xfe, 0xff, 0x7ffe, 0x7fff, 0x8000, 0x8001,
     0xfffe, 0xffff, 0x007fffff /*Max subnormal + */, 0x00800000 /*Min+ */,
     0x7f7fffff /*Max+ */, 0x7f800000 /*+Inf*/, 0xff800000 /*-Inf*/,
@@ -51,7 +52,7 @@
   volatile uint8_t l8;
   volatile uint16_t l16;
   volatile uint32_t l32;
-  volatile uint64_t l64;
+  volatile uint64 l64;
 } AtomicLocs;
 
 template <typename Type>
@@ -91,12 +92,12 @@
           } else {
             ++Failures;
             std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                      << "(" << static_cast<uint64_t>(Value1) << ", "
-                      << static_cast<uint64_t>(Value2)
-                      << "): sz1=" << static_cast<uint64_t>(ResultSz1)
-                      << " llc1=" << static_cast<uint64_t>(ResultLlc1)
-                      << " sz2=" << static_cast<uint64_t>(ResultSz2)
-                      << " llc2=" << static_cast<uint64_t>(ResultLlc2) << "\n";
+                      << "(" << static_cast<uint64>(Value1) << ", "
+                      << static_cast<uint64>(Value2)
+                      << "): sz1=" << static_cast<uint64>(ResultSz1)
+                      << " llc1=" << static_cast<uint64>(ResultLlc1)
+                      << " sz2=" << static_cast<uint64>(ResultSz2)
+                      << " llc2=" << static_cast<uint64>(ResultLlc2) << "\n";
           }
         }
       }
@@ -137,12 +138,12 @@
           } else {
             ++Failures;
             std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
-                      << "(" << static_cast<uint64_t>(Value1) << ", "
-                      << static_cast<uint64_t>(Value2)
-                      << "): sz1=" << static_cast<uint64_t>(ResultSz1)
-                      << " llc1=" << static_cast<uint64_t>(ResultLlc1)
-                      << " sz2=" << static_cast<uint64_t>(ResultSz2)
-                      << " llc2=" << static_cast<uint64_t>(ResultLlc2) << "\n";
+                      << "(" << static_cast<uint64>(Value1) << ", "
+                      << static_cast<uint64>(Value2)
+                      << "): sz1=" << static_cast<uint64>(ResultSz1)
+                      << " llc1=" << static_cast<uint64>(ResultLlc1)
+                      << " sz2=" << static_cast<uint64>(ResultSz2)
+                      << " llc2=" << static_cast<uint64>(ResultLlc2) << "\n";
           }
         }
       }
@@ -166,6 +167,22 @@
   return NULL;
 }
 
+#ifndef X8664_STACK_HACK
+void AllocStackForThread(uint32, pthread_attr_t *) {} // No-op without the hack.
+#else  // defined(X8664_STACK_HACK)
+void AllocStackForThread(uint32 m, pthread_attr_t *attr) { // Stack for thread m.
+  static const uint32_t ThreadStackBase = 0x60000000;
+  static const uint32_t ThreadStackSize = 4 << 20; // 4MB.
+  const int Err = pthread_attr_setstack( // Returns the error; errno is not set.
+      attr, xAllocStack(ThreadStackBase - 2 * m * ThreadStackSize,
+                        ThreadStackSize), ThreadStackSize);
+  if (Err != 0) {
+    std::cout << "pthread_attr_setstack: " << strerror(Err) << "\n";
+    abort();
+  }
+}
+#endif // X8664_STACK_HACK
+
 template <typename Type>
 void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
                           size_t &Passes, size_t &Failures) {
@@ -184,7 +201,7 @@
   const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
 
   // Just test a few values, otherwise it takes a *really* long time.
-  volatile uint64_t ValuesSubset[] = {1, 0x7e, 0x000fffffffffffffffll};
+  volatile uint64 ValuesSubset[] = {1, 0x7e, 0x000fffffffffffffffll};
   const size_t NumValuesSubset = sizeof(ValuesSubset) / sizeof(*ValuesSubset);
 
   for (size_t f = 0; f < NumFuncs; ++f) {
@@ -200,12 +217,18 @@
         ++TotalTests;
         const size_t NumThreads = 4;
         pthread_t t[NumThreads];
+        pthread_attr_t attr[NumThreads];
 
         // Try N threads w/ just Llc.
         *AtomicLoc = Value1;
         for (size_t m = 0; m < NumThreads; ++m) {
-          pthread_create(&t[m], NULL, &threadWrapper<Type>,
-                         reinterpret_cast<void *>(&TDataLlc));
+          pthread_attr_init(&attr[m]);
+          AllocStackForThread(m, &attr[m]);
+          if (pthread_create(&t[m], &attr[m], &threadWrapper<Type>,
+                             reinterpret_cast<void *>(&TDataLlc)) != 0) {
+            std::cout << "pthread_create failed w/ " << strerror(errno) << "\n";
+            abort();
+          }
         }
         for (size_t m = 0; m < NumThreads; ++m) {
           pthread_join(t[m], NULL);
@@ -215,7 +238,9 @@
         // Try N threads w/ both Sz and Llc.
         *AtomicLoc = Value1;
         for (size_t m = 0; m < NumThreads; ++m) {
-          if (pthread_create(&t[m], NULL, &threadWrapper<Type>,
+          pthread_attr_init(&attr[m]);
+          AllocStackForThread(m, &attr[m]);
+          if (pthread_create(&t[m], &attr[m], &threadWrapper<Type>,
                              m % 2 == 0
                                  ? reinterpret_cast<void *>(&TDataLlc)
                                  : reinterpret_cast<void *>(&TDataSz)) != 0) {
@@ -238,18 +263,21 @@
         } else {
           ++Failures;
           std::cout << "test_with_threads_" << Funcs[f].Name
-                    << (8 * sizeof(Type)) << "("
-                    << static_cast<uint64_t>(Value1) << ", "
-                    << static_cast<uint64_t>(Value2)
-                    << "): llc=" << static_cast<uint64_t>(ResultLlc)
-                    << " mixed=" << static_cast<uint64_t>(ResultMixed) << "\n";
+                    << (8 * sizeof(Type)) << "(" << static_cast<uint64>(Value1)
+                    << ", " << static_cast<uint64>(Value2)
+                    << "): llc=" << static_cast<uint64>(ResultLlc)
+                    << " mixed=" << static_cast<uint64>(ResultMixed) << "\n";
         }
       }
     }
   }
 }
 
-int main(int argc, char **argv) {
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
+int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
@@ -257,18 +285,17 @@
   testAtomicRMW<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
   testAtomicRMW<uint16_t>(&AtomicLocs.l16, TotalTests, Passes, Failures);
   testAtomicRMW<uint32_t>(&AtomicLocs.l32, TotalTests, Passes, Failures);
-  testAtomicRMW<uint64_t>(&AtomicLocs.l64, TotalTests, Passes, Failures);
+  testAtomicRMW<uint64>(&AtomicLocs.l64, TotalTests, Passes, Failures);
   testValCompareAndSwap<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
   testValCompareAndSwap<uint16_t>(&AtomicLocs.l16, TotalTests, Passes,
                                   Failures);
   testValCompareAndSwap<uint32_t>(&AtomicLocs.l32, TotalTests, Passes,
                                   Failures);
-  testValCompareAndSwap<uint64_t>(&AtomicLocs.l64, TotalTests, Passes,
-                                  Failures);
+  testValCompareAndSwap<uint64>(&AtomicLocs.l64, TotalTests, Passes, Failures);
   testAtomicRMWThreads<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
   testAtomicRMWThreads<uint16_t>(&AtomicLocs.l16, TotalTests, Passes, Failures);
   testAtomicRMWThreads<uint32_t>(&AtomicLocs.l32, TotalTests, Passes, Failures);
-  testAtomicRMWThreads<uint64_t>(&AtomicLocs.l64, TotalTests, Passes, Failures);
+  testAtomicRMWThreads<uint64>(&AtomicLocs.l64, TotalTests, Passes, Failures);
 
   std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
             << " Failures=" << Failures << "\n";
diff --git a/crosstest/test_vector_ops_main.cpp b/crosstest/test_vector_ops_main.cpp
index 1232799..4b9591a 100644
--- a/crosstest/test_vector_ops_main.cpp
+++ b/crosstest/test_vector_ops_main.cpp
@@ -130,7 +130,11 @@
   free(TestVectors);
 }
 
+#ifdef X8664_STACK_HACK
+extern "C" int wrapped_main(int argc, char *argv[]) {
+#else  // !defined(X8664_STACK_HACK)
 int main(int argc, char *argv[]) {
+#endif // X8664_STACK_HACK
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
diff --git a/crosstest/xdefs.h b/crosstest/xdefs.h
new file mode 100644
index 0000000..00a4512
--- /dev/null
+++ b/crosstest/xdefs.h
@@ -0,0 +1,53 @@
+//===- subzero/crosstest/xdefs.h - Definitions for the crosstests. --------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the int64 and uint64 types to avoid link-time errors when compiling
+// the crosstests in LP64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_CROSSTEST_XDEFS_H_
+#define SUBZERO_CROSSTEST_XDEFS_H_
+
+typedef int int32; // Signed, unlike uint32 below.
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+typedef unsigned int SizeT;
+
+#ifdef X8664_STACK_HACK
+
+// The X8664_STACK_HACK is an intrusive way of getting the crosstests to run in
+// x86_64 LP64 even with an ILP32 model. This hack allocates a new stack for
+// running the tests in the low 4GB of the address space.
+
+#ifdef __cplusplus
+#define XTEST_EXTERN extern "C"
+#else // !defined(__cplusplus)
+#define XTEST_EXTERN extern
+#endif // __cplusplus
+
+/// xAllocStack allocates the memory chunk [StackEnd - Size - 1, StackEnd). It
+/// requires StackEnd to be less than 32-bits long. Conversely, xDeallocStack
+/// frees that memory chunk.
+/// {@
+XTEST_EXTERN unsigned char *xAllocStack(uint64 StackEnd, uint32 Size);
+XTEST_EXTERN void xDeallocStack(uint64 StackEnd, uint32 Size);
+/// @}
+
+// wrapped_main is invoked by the x86-64 stack hack main. We declare a prototype
+// so the compiler (and not the linker) can yell if a test's wrapped_main
+// prototype does not match what we want.
+XTEST_EXTERN int wrapped_main(int argc, char *argv[]);
+
+#undef XTEST_EXTERN
+
+#endif // X8664_STACK_HACK
+
+#endif // SUBZERO_CROSSTEST_XDEFS_H_
diff --git a/pydir/build-runtime.py b/pydir/build-runtime.py
index 4010b51..ad38a2e 100755
--- a/pydir/build-runtime.py
+++ b/pydir/build-runtime.py
@@ -124,6 +124,8 @@
 
         MakeRuntimesForTarget(targets.X8632Target, ll_files,
                               srcdir, tempdir, rtdir, args.verbose)
+        MakeRuntimesForTarget(targets.X8664Target, ll_files,
+                              srcdir, tempdir, rtdir, args.verbose)
         MakeRuntimesForTarget(targets.ARM32Target, ll_files,
                               srcdir, tempdir, rtdir, args.verbose)
 
diff --git a/pydir/crosstest.py b/pydir/crosstest.py
index c8ab306..d5f240a 100755
--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py
@@ -177,6 +177,18 @@
             'szrt_{sb}_' + args.target + '.o'
             ).format(root=nacl_root, sb='sb' if args.sandbox else 'native'))
     pure_c = os.path.splitext(args.driver)[1] == '.c'
+
+    # TargetX8664 is ilp32, but clang does not currently support such
+    # configuration. In order to run the crosstests we play nasty, dangerous
+    # tricks with the stack pointer.
+    needs_stack_hack = (args.target == 'x8664')
+    stack_hack_params = []
+    if needs_stack_hack:
+      shellcmd('{bin}/clang -g -o stack_hack.x8664.{key}.o -c '
+               'stack_hack.x8664.c'.format(bin=bindir, key=key))
+      stack_hack_params.append('-DX8664_STACK_HACK')
+      stack_hack_params.append('stack_hack.x8664.{key}.o'.format(key=key))
+    
     # Set compiler to clang, clang++, pnacl-clang, or pnacl-clang++.
     compiler = '{bin}/{prefix}{cc}'.format(
         bin=bindir, prefix='pnacl-' if args.sandbox else '',
@@ -189,7 +201,7 @@
                        '-lm', '-lpthread',
                        '-Wl,--defsym=__Sz_AbsoluteZero=0'] +
                       target_info.cross_headers)
-    shellcmd([compiler, args.driver] + objs +
+    shellcmd([compiler] + stack_hack_params + [args.driver] + objs +
              ['-o', os.path.join(args.dir, args.output)] + sb_native_args)
 
 if __name__ == '__main__':
diff --git a/pydir/crosstest_generator.py b/pydir/crosstest_generator.py
index b5d64ab..dd72b80 100755
--- a/pydir/crosstest_generator.py
+++ b/pydir/crosstest_generator.py
@@ -55,15 +55,17 @@
   root = FindBaseNaCl()
 
   # The rest of the attribute sets.
-  targets = [ 'x8632', 'arm32' ]
+  targets = [ 'x8632', 'x8664', 'arm32' ]
   sandboxing = [ 'native', 'sandbox' ]
   opt_levels = [ 'Om1', 'O2' ]
   arch_attrs = { 'x8632': [ 'sse2', 'sse4.1' ],
+                 'x8664': [ 'sse2', 'sse4.1' ],
                  'arm32': [ 'neon', 'hwdiv-arm' ] }
   flat_attrs = []
   for v in arch_attrs.values():
     flat_attrs += v
   arch_flags = { 'x8632': [],
+                 'x8664': [],
                  # ARM doesn't have an integrated assembler yet.
                  'arm32': ['--filetype=asm'] }
   # all_keys is only used in the help text.
diff --git a/pydir/targets.py b/pydir/targets.py
index c2188e5..3635e13 100644
--- a/pydir/targets.py
+++ b/pydir/targets.py
@@ -40,6 +40,5 @@
                          ld_emu='armelf_nacl',
                          cross_headers=['-isystem', FindARMCrossInclude()])
 
-
 def ConvertTripleToNaCl(nonsfi_triple):
   return nonsfi_triple.replace('linux', 'nacl')
diff --git a/runtime/szrt_profiler.c b/runtime/szrt_profiler.c
index e31692e..34a647ac2 100644
--- a/runtime/szrt_profiler.c
+++ b/runtime/szrt_profiler.c
@@ -1,3 +1,4 @@
+#include <inttypes.h>
 #include <stdint.h>
 #include <stdio.h>
 
@@ -53,7 +54,7 @@
   printf("%s", SubzeroLogo);
   for (const struct BlockProfileInfo **curr = &__Sz_block_profile_info;
        *curr != NULL; ++curr) {
-    printf("%lld\t%s\n", (*curr)->Counter, (*curr)->BlockName);
+    printf("%" PRIu64 "\t%s\n", (*curr)->Counter, (*curr)->BlockName);
   }
   fflush(stdout);
 }
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 9d872d2..c34b776 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -243,9 +243,9 @@
 
   // Cross Xmm/GPR cast instructions.
   template <typename DReg_t, typename SReg_t> struct CastEmitterRegOp {
-    typedef void (AssemblerX86Base::*TypedEmitRegs)(Type, DReg_t, SReg_t);
+    typedef void (AssemblerX86Base::*TypedEmitRegs)(Type, DReg_t, Type, SReg_t);
     typedef void (AssemblerX86Base::*TypedEmitAddr)(
-        Type, DReg_t, const typename Traits::Address &);
+        Type, DReg_t, Type, const typename Traits::Address &);
 
     TypedEmitRegs RegReg;
     TypedEmitAddr RegAddr;
@@ -299,7 +299,14 @@
            typename Traits::GPRRegister src);
   void mov(Type Ty, const typename Traits::Address &dst, const Immediate &imm);
 
-  void movFromAh(const typename Traits::GPRRegister dst);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  movabs(const typename Traits::GPRRegister Dst, uint64_t Imm64);
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  movabs(const typename Traits::GPRRegister, uint64_t) {
+    llvm::report_fatal_error("movabs is only supported in 64-bit x86 targets.");
+  }
 
   void movzx(Type Ty, typename Traits::GPRRegister dst,
              typename Traits::GPRRegister src);
@@ -328,11 +335,13 @@
   void movss(Type Ty, typename Traits::XmmRegister dst,
              typename Traits::XmmRegister src);
 
-  void movd(typename Traits::XmmRegister dst, typename Traits::GPRRegister src);
-  void movd(typename Traits::XmmRegister dst,
+  void movd(Type SrcTy, typename Traits::XmmRegister dst,
+            typename Traits::GPRRegister src);
+  void movd(Type SrcTy, typename Traits::XmmRegister dst,
             const typename Traits::Address &src);
-  void movd(typename Traits::GPRRegister dst, typename Traits::XmmRegister src);
-  void movd(const typename Traits::Address &dst,
+  void movd(Type DestTy, typename Traits::GPRRegister dst,
+            typename Traits::XmmRegister src);
+  void movd(Type DestTy, const typename Traits::Address &dst,
             typename Traits::XmmRegister src);
 
   void movq(typename Traits::XmmRegister dst, typename Traits::XmmRegister src);
@@ -504,9 +513,9 @@
   void cvttps2dq(Type, typename Traits::XmmRegister dst,
                  const typename Traits::Address &src);
 
-  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst,
+  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst, Type SrcTy,
                 typename Traits::GPRRegister src);
-  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst,
+  void cvtsi2ss(Type DestTy, typename Traits::XmmRegister dst, Type SrcTy,
                 const typename Traits::Address &src);
 
   void cvtfloat2float(Type SrcTy, typename Traits::XmmRegister dst,
@@ -514,9 +523,9 @@
   void cvtfloat2float(Type SrcTy, typename Traits::XmmRegister dst,
                       const typename Traits::Address &src);
 
-  void cvttss2si(Type SrcTy, typename Traits::GPRRegister dst,
+  void cvttss2si(Type DestTy, typename Traits::GPRRegister dst, Type SrcTy,
                  typename Traits::XmmRegister src);
-  void cvttss2si(Type SrcTy, typename Traits::GPRRegister dst,
+  void cvttss2si(Type DestTy, typename Traits::GPRRegister dst, Type SrcTy,
                  const typename Traits::Address &src);
 
   void ucomiss(Type Ty, typename Traits::XmmRegister a,
@@ -719,6 +728,12 @@
   void cbw();
   void cwd();
   void cdq();
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type cqo();
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type cqo() {
+    llvm::report_fatal_error("CQO is only available in 64-bit x86 backends.");
+  }
 
   void div(Type Ty, typename Traits::GPRRegister reg);
   void div(Type Ty, const typename Traits::Address &address);
@@ -936,7 +951,7 @@
                      typename Traits::GPRRegister>::value;
 
     return IsGPR && (Reg & 0x04) != 0 && (Reg & 0x08) == 0 &&
-           isByteSizedArithType(Ty);
+           isByteSizedType(Ty);
   };
 
   // assembleAndEmitRex is used for determining which (if any) rex prefix should
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index f785756..2cb039a 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -207,6 +207,8 @@
     emitUint8(0xB0 + gprEncoding(dst));
     emitUint8(imm.value() & 0xFF);
   } else {
+    // TODO(jpp): When removing the assertion above ensure that in x86-64 we
+    // emit a 64-bit immediate.
     emitUint8(0xB8 + gprEncoding(dst));
     emitImmediate(Ty, imm);
   }
@@ -279,9 +281,34 @@
 }
 
 template <class Machine>
+template <typename T>
+typename std::enable_if<T::Is64Bit, void>::type
+AssemblerX86Base<Machine>::movabs(const typename Traits::GPRRegister Dst,
+                                  uint64_t Imm64) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  const bool NeedsRexW = (Imm64 & ~0xFFFFFFFFull) != 0; // Any upper bit set?
+  const Type RexType = NeedsRexW ? RexTypeForceRexW : RexTypeIrrelevant;
+  emitRexB(RexType, Dst);
+  emitUint8(0xB8 | gprEncoding(Dst));
+  // When emitting Imm64, we don't have to mask out the upper 32 bits, because
+  // emitInt32 will/should only emit a 32-bit constant. In reality, we are
+  // paranoid, so we go ahead and mask the upper bits out anyway.
+  emitInt32(Imm64 & 0xFFFFFFFF);
+  if (NeedsRexW) // The upper half is emitted only when it is non-zero.
+    emitInt32((Imm64 >> 32) & 0xFFFFFFFF);
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::movzx(Type SrcTy,
                                       typename Traits::GPRRegister dst,
                                       typename Traits::GPRRegister src) {
+  if (Traits::Is64Bit && SrcTy == IceType_i32) {
+    // 32-bit mov clears the upper 32 bits, hence zero-extending the 32-bit
+    // operand to 64-bit.
+    mov(IceType_i32, dst, src);
+    return;
+  }
+
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
@@ -295,6 +322,13 @@
 void AssemblerX86Base<Machine>::movzx(Type SrcTy,
                                       typename Traits::GPRRegister dst,
                                       const typename Traits::Address &src) {
+  if (Traits::Is64Bit && SrcTy == IceType_i32) {
+    // 32-bit mov clears the upper 32 bits, hence zero-extending the 32-bit
+    // operand to 64-bit.
+    mov(IceType_i32, dst, src);
+    return;
+  }
+
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   bool ByteSized = isByteSizedType(SrcTy);
   assert(ByteSized || SrcTy == IceType_i16);
@@ -359,7 +393,7 @@
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   else
-    assert(Ty == IceType_i32);
+    assert(Ty == IceType_i32 || (Traits::Is64Bit && Ty == IceType_i64));
   emitRexRB(Ty, dst, src);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
@@ -375,7 +409,7 @@
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   else
-    assert(Ty == IceType_i32);
+    assert(Ty == IceType_i32 || (Traits::Is64Bit && Ty == IceType_i64));
   emitRex(Ty, src, dst);
   emitUint8(0x0F);
   emitUint8(0x40 + cond);
@@ -423,44 +457,48 @@
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(typename Traits::XmmRegister dst,
+void AssemblerX86Base<Machine>::movd(Type SrcTy,
+                                     typename Traits::XmmRegister dst,
                                      typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRexRB(RexTypeIrrelevant, dst, src);
+  emitRexRB(SrcTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x6E);
   emitRegisterOperand(gprEncoding(dst), gprEncoding(src));
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(typename Traits::XmmRegister dst,
+void AssemblerX86Base<Machine>::movd(Type SrcTy,
+                                     typename Traits::XmmRegister dst,
                                      const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRex(RexTypeIrrelevant, src, dst);
+  emitRex(SrcTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x6E);
   emitOperand(gprEncoding(dst), src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(typename Traits::GPRRegister dst,
+void AssemblerX86Base<Machine>::movd(Type DestTy,
+                                     typename Traits::GPRRegister dst,
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRexRB(RexTypeIrrelevant, src, dst);
+  emitRexRB(DestTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x7E);
   emitRegisterOperand(gprEncoding(src), gprEncoding(dst));
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::movd(const typename Traits::Address &dst,
+void AssemblerX86Base<Machine>::movd(Type DestTy,
+                                     const typename Traits::Address &dst,
                                      typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x66);
-  emitRex(RexTypeIrrelevant, dst, src);
+  emitRex(DestTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x7E);
   emitOperand(gprEncoding(src), dst);
@@ -1343,7 +1381,7 @@
   // Load 32-bit immediate value into tmp1.
   mov(IceType_i32, tmp1, imm);
   // Move value from tmp1 into dst.
-  movd(dst, tmp1);
+  movd(IceType_i32, dst, tmp1);
   // Broadcast low lane into other three lanes.
   shufps(RexTypeIrrelevant, dst, dst, Immediate(0x0));
 }
@@ -1487,10 +1525,11 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::cvtsi2ss(Type DestTy,
                                          typename Traits::XmmRegister dst,
+                                         Type SrcTy,
                                          typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
-  emitRexRB(RexTypeIrrelevant, dst, src);
+  emitRexRB(SrcTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2A);
   emitXmmRegisterOperand(dst, src);
@@ -1499,10 +1538,11 @@
 template <class Machine>
 void AssemblerX86Base<Machine>::cvtsi2ss(Type DestTy,
                                          typename Traits::XmmRegister dst,
+                                         Type SrcTy,
                                          const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(DestTy) ? 0xF3 : 0xF2);
-  emitRex(RexTypeIrrelevant, src, dst);
+  emitRex(SrcTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2A);
   emitOperand(gprEncoding(dst), src);
@@ -1534,24 +1574,26 @@
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::cvttss2si(Type SrcTy,
+void AssemblerX86Base<Machine>::cvttss2si(Type DestTy,
                                           typename Traits::GPRRegister dst,
+                                          Type SrcTy,
                                           typename Traits::XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
-  emitRexRB(RexTypeIrrelevant, dst, src);
+  emitRexRB(DestTy, dst, src);
   emitUint8(0x0F);
   emitUint8(0x2C);
   emitXmmRegisterOperand(dst, src);
 }
 
 template <class Machine>
-void AssemblerX86Base<Machine>::cvttss2si(Type SrcTy,
+void AssemblerX86Base<Machine>::cvttss2si(Type DestTy,
                                           typename Traits::GPRRegister dst,
+                                          Type SrcTy,
                                           const typename Traits::Address &src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(isFloat32Asserting32Or64(SrcTy) ? 0xF3 : 0xF2);
-  emitRex(RexTypeIrrelevant, src, dst);
+  emitRex(DestTy, src, dst);
   emitUint8(0x0F);
   emitUint8(0x2C);
   emitOperand(gprEncoding(dst), src);
@@ -2401,6 +2443,15 @@
 }
 
 template <class Machine>
+template <typename T>
+typename std::enable_if<T::Is64Bit, void>::type
+AssemblerX86Base<Machine>::cqo() {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitRexB(RexTypeForceRexW, RexRegIrrelevant);
+  emitUint8(0x99); // With the forced REX.W prefix above, 0x99 encodes cqo.
+}
+
+template <class Machine>
 void AssemblerX86Base<Machine>::div(Type Ty, typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
@@ -2459,7 +2510,8 @@
 void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister dst,
                                      typename Traits::GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  assert(Ty == IceType_i16 || Ty == IceType_i32 ||
+         (Traits::Is64Bit && Ty == IceType_i64));
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   emitRexRB(Ty, dst, src);
@@ -2472,7 +2524,8 @@
 void AssemblerX86Base<Machine>::imul(Type Ty, typename Traits::GPRRegister reg,
                                      const typename Traits::Address &address) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  assert(Ty == IceType_i16 || Ty == IceType_i32);
+  assert(Ty == IceType_i16 || Ty == IceType_i32 ||
+         (Traits::Is64Bit && Ty == IceType_i64));
   if (Ty == IceType_i16)
     emitOperandSizeOverride();
   emitRex(Ty, address, reg);
@@ -2790,8 +2843,7 @@
 void AssemblerX86Base<Machine>::bswap(Type Ty,
                                       typename Traits::GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
-  assert(Ty == IceType_i32);
-  (void)Ty;
+  assert(Ty == IceType_i32 || (Traits::Is64Bit && Ty == IceType_i64));
   emitRexB(Ty, reg);
   emitUint8(0x0F);
   emitUint8(0xC8 | gprEncoding(reg));
diff --git a/src/IceELFSection.h b/src/IceELFSection.h
index 5cf89a5..961d8d2 100644
--- a/src/IceELFSection.h
+++ b/src/IceELFSection.h
@@ -362,8 +362,7 @@
       llvm::report_fatal_error("Missing symbol mentioned in reloc");
 
     if (IsELF64) {
-      llvm_unreachable(
-          "Not tested -- check that Fixup.offset() is correct even for pc-rel");
+      // TODO(jpp): check that Fixup.offset() is correct even for pc-rel.
       Elf64_Rela Rela;
       Rela.r_offset = Fixup.position();
       Rela.setSymbolAndType(Symbol->getNumber(), Fixup.kind());
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index c6d6abf..3a56e1b 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -206,7 +206,7 @@
     } else if (const auto CR =
                    llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
       Disp = CR->getOffset();
-      Fixup = Asm->createFixup(llvm::ELF::R_386_32, CR);
+      Fixup = Asm->createFixup(RelFixup, CR);
     } else {
       llvm_unreachable("Unexpected offset type");
     }
diff --git a/src/IceInstX8664.cpp b/src/IceInstX8664.cpp
index 3709180..49dc9d8 100644
--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -179,8 +179,8 @@
       Disp = static_cast<int32_t>(CI->getValue());
     } else if (const auto CR =
                    llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
-      Disp = CR->getOffset();
-      Fixup = Asm->createFixup(llvm::ELF::R_386_32, CR);
+      Disp = CR->getOffset() - 4;
+      Fixup = Asm->createFixup(PcRelFixup, CR);
     } else {
       llvm_unreachable("Unexpected offset type");
     }
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 6d39005..b0eb1ad 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -1100,6 +1100,8 @@
     : public InstX86BaseUnaryopGPR<Machine, InstX86Base<Machine>::Movsx> {
 public:
   static InstX86Movsx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
     return new (Func->allocate<InstX86Movsx>()) InstX86Movsx(Func, Dest, Src);
   }
 
@@ -1116,6 +1118,8 @@
     : public InstX86BaseUnaryopGPR<Machine, InstX86Base<Machine>::Movzx> {
 public:
   static InstX86Movzx *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    assert(typeWidthInBytes(Dest->getType()) >
+           typeWidthInBytes(Src->getType()));
     return new (Func->allocate<InstX86Movzx>()) InstX86Movzx(Func, Dest, Src);
   }
 
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 34417cf..4d26210 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -729,7 +729,8 @@
   } else if (const auto Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
     (Asm->*(Emitter.GPRImm))(Ty, VarReg, Immediate(Imm->getValue()));
   } else if (const auto Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Reloc);
+    AssemblerFixup *Fixup =
+        Asm->createFixup(InstX86Base<Machine>::Traits::RelFixup, Reloc);
     (Asm->*(Emitter.GPRImm))(Ty, VarReg, Immediate(Reloc->getOffset(), Fixup));
   } else if (const auto Split = llvm::dyn_cast<
                  typename InstX86Base<Machine>::Traits::VariableSplit>(Src)) {
@@ -758,7 +759,8 @@
   } else if (const auto Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
     (Asm->*(Emitter.AddrImm))(Ty, Addr, Immediate(Imm->getValue()));
   } else if (const auto Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Reloc);
+    AssemblerFixup *Fixup =
+        Asm->createFixup(InstX86Base<Machine>::Traits::RelFixup, Reloc);
     (Asm->*(Emitter.AddrImm))(Ty, Addr, Immediate(Reloc->getOffset(), Fixup));
   } else {
     llvm_unreachable("Unexpected operand type");
@@ -929,8 +931,8 @@
 
 template <class Machine, typename DReg_t, typename SReg_t,
           DReg_t (*destEnc)(int32_t), SReg_t (*srcEnc)(int32_t)>
-void emitIASCastRegOp(const Cfg *Func, Type DispatchTy, const Variable *Dest,
-                      const Operand *Src,
+void emitIASCastRegOp(const Cfg *Func, Type DestTy, const Variable *Dest,
+                      Type SrcTy, const Operand *Src,
                       const typename InstX86Base<Machine>::Traits::Assembler::
                           template CastEmitterRegOp<DReg_t, SReg_t> &Emitter) {
   typename InstX86Base<Machine>::Traits::Assembler *Asm =
@@ -940,18 +942,18 @@
   if (const auto SrcVar = llvm::dyn_cast<Variable>(Src)) {
     if (SrcVar->hasReg()) {
       SReg_t SrcReg = srcEnc(SrcVar->getRegNum());
-      (Asm->*(Emitter.RegReg))(DispatchTy, DestReg, SrcReg);
+      (Asm->*(Emitter.RegReg))(DestTy, DestReg, SrcTy, SrcReg);
     } else {
       typename InstX86Base<Machine>::Traits::Address SrcStackAddr =
           static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(
               Func->getTarget())
               ->stackVarToAsmOperand(SrcVar);
-      (Asm->*(Emitter.RegAddr))(DispatchTy, DestReg, SrcStackAddr);
+      (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, SrcStackAddr);
     }
   } else if (const auto Mem = llvm::dyn_cast<
                  typename InstX86Base<Machine>::Traits::X86OperandMem>(Src)) {
     Mem->emitSegmentOverride(Asm);
-    (Asm->*(Emitter.RegAddr))(DispatchTy, DestReg, Mem->toAsmAddress(Asm));
+    (Asm->*(Emitter.RegAddr))(DestTy, DestReg, SrcTy, Mem->toAsmAddress(Asm));
   } else {
     llvm_unreachable("Unexpected operand type");
   }
@@ -1387,17 +1389,26 @@
   case IceType_i8:
     assert(this->getDest()->getRegNum() ==
            InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    Str << "\tcbtw";
+    Str << "\t"
+        << "cbtw";
     break;
   case IceType_i16:
     assert(this->getDest()->getRegNum() ==
            InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
-    Str << "\tcwtd";
+    Str << "\t"
+        << "cwtd";
     break;
   case IceType_i32:
     assert(this->getDest()->getRegNum() ==
            InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
-    Str << "\tcltd";
+    Str << "\t"
+        << "cltd";
+    break;
+  case IceType_i64:
+    assert(this->getDest()->getRegNum() ==
+           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    Str << "\t"
+        << "cqto"; // AT&T spelling of cqo (sign-extend rax into rdx:rax).
     break;
   }
 }
@@ -1430,6 +1441,11 @@
            InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
     Asm->cdq();
     break;
+  case IceType_i64:
+    assert(this->getDest()->getRegNum() ==
+           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    Asm->cqo();
+    break;
   }
 }
 
@@ -1592,7 +1608,8 @@
   assert(this->getSrcSize() == 2);
   Operand *Src = this->getSrc(1);
   Type SrcTy = Src->getType();
-  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32);
+  assert(SrcTy == IceType_i16 || SrcTy == IceType_i32 ||
+         (InstX86Base<Machine>::Traits::Is64Bit && SrcTy == IceType_i64));
   typename InstX86Base<Machine>::Traits::Assembler *Asm =
       Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
   if (const auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
@@ -1814,7 +1831,11 @@
   switch (Variant) {
   case Si2ss: {
     assert(isScalarIntegerType(SrcTy));
-    assert(typeWidthInBytes(SrcTy) <= 4);
+    if (!InstX86Base<Machine>::Traits::Is64Bit) {
+      assert(typeWidthInBytes(SrcTy) <= 4);
+    } else {
+      assert(SrcTy == IceType_i32 || SrcTy == IceType_i64);
+    }
     assert(isScalarFloatingType(DestTy));
     static const typename InstX86Base<Machine>::Traits::Assembler::
         template CastEmitterRegOp<
@@ -1828,13 +1849,17 @@
         typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR>(
-        Func, DestTy, Dest, Src, Emitter);
+        Func, DestTy, Dest, SrcTy, Src, Emitter);
     return;
   }
   case Tss2si: {
     assert(isScalarFloatingType(SrcTy));
     assert(isScalarIntegerType(DestTy));
-    assert(typeWidthInBytes(DestTy) <= 4);
+    if (!InstX86Base<Machine>::Traits::Is64Bit) {
+      assert(typeWidthInBytes(DestTy) <= 4);
+    } else {
+      assert(DestTy == IceType_i32 || DestTy == IceType_i64);
+    }
     static const typename InstX86Base<Machine>::Traits::Assembler::
         template CastEmitterRegOp<
             typename InstX86Base<Machine>::Traits::RegisterSet::GPRRegister,
@@ -1847,7 +1872,7 @@
         typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR,
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm>(
-        Func, SrcTy, Dest, Src, Emitter);
+        Func, DestTy, Dest, SrcTy, Src, Emitter);
     return;
   }
   case Float2float: {
@@ -2244,6 +2269,10 @@
   this->getDest()->emit(Func);
 }
 
+inline bool isIntegerConstant(const Operand *Op) {
+  return llvm::isa<ConstantInteger32>(Op) || llvm::isa<ConstantInteger64>(Op);
+}
+
 template <class Machine> void InstX86Mov<Machine>::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -2252,11 +2281,16 @@
   Operand *Src = this->getSrc(0);
   Type SrcTy = Src->getType();
   Type DestTy = this->getDest()->getType();
-  Str << "\tmov"
-      << (!isScalarFloatingType(DestTy)
-              ? this->getWidthString(SrcTy)
-              : InstX86Base<Machine>::Traits::TypeAttributes[DestTy].SdSsString)
-      << "\t";
+  if (InstX86Base<Machine>::Traits::Is64Bit && DestTy == IceType_i64 &&
+      isIntegerConstant(Src)) {
+    Str << "\tmovabs\t";
+  } else {
+    Str << "\tmov"
+        << (!isScalarFloatingType(DestTy)
+                ? this->getWidthString(SrcTy)
+                : InstX86Base<Machine>::Traits::TypeAttributes[DestTy]
+                      .SdSsString) << "\t";
+  }
   // For an integer truncation operation, src is wider than dest.
   // Ideally, we use a mov instruction whose data width matches the
   // narrower dest.  This is a problem if e.g. src is a register like
@@ -2320,6 +2354,20 @@
       assert(isScalarIntegerType(DestTy));
       // Widen DestTy for truncation (see above note). We should only do this
       // when both Src and Dest are integer types.
+      if (InstX86Base<Machine>::Traits::Is64Bit && DestTy == IceType_i64 &&
+          isIntegerConstant(Src)) {
+        uint64_t Value = -1;
+        if (const auto *C64 = llvm::dyn_cast<ConstantInteger64>(Src)) {
+          Value = C64->getValue();
+        } else {
+          Value = llvm::cast<ConstantInteger32>(Src)->getValue();
+        }
+        Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>()
+            ->movabs(InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+                         Dest->getRegNum()),
+                     Value);
+        return;
+      }
       if (isScalarIntegerType(SrcTy)) {
         DestTy = SrcTy;
       }
@@ -2363,14 +2411,19 @@
   const auto SrcVar = llvm::cast<Variable>(this->getSrc(0));
   // For insert/extract element (one of Src/Dest is an Xmm vector and
   // the other is an int type).
-  if (SrcVar->getType() == IceType_i32) {
-    assert(isVectorType(Dest->getType()));
+  if (SrcVar->getType() == IceType_i32 ||
+      (InstX86Base<Machine>::Traits::Is64Bit &&
+       SrcVar->getType() == IceType_i64)) {
+    assert(isVectorType(Dest->getType()) ||
+           (isScalarFloatingType(Dest->getType()) &&
+            typeWidthInBytes(SrcVar->getType()) ==
+                typeWidthInBytes(Dest->getType())));
     assert(Dest->hasReg());
     typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister DestReg =
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm(
             Dest->getRegNum());
     if (SrcVar->hasReg()) {
-      Asm->movd(DestReg,
+      Asm->movd(SrcVar->getType(), DestReg,
                 InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
                     SrcVar->getRegNum()));
     } else {
@@ -2378,17 +2431,23 @@
           static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(
               Func->getTarget())
               ->stackVarToAsmOperand(SrcVar));
-      Asm->movd(DestReg, StackAddr);
+      Asm->movd(SrcVar->getType(), DestReg, StackAddr);
     }
   } else {
-    assert(isVectorType(SrcVar->getType()));
+    assert(isVectorType(SrcVar->getType()) ||
+           (isScalarFloatingType(SrcVar->getType()) &&
+            typeWidthInBytes(SrcVar->getType()) ==
+                typeWidthInBytes(Dest->getType())));
     assert(SrcVar->hasReg());
-    assert(Dest->getType() == IceType_i32);
+    assert(Dest->getType() == IceType_i32 ||
+           (InstX86Base<Machine>::Traits::Is64Bit &&
+            Dest->getType() == IceType_i64));
     typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister SrcReg =
         InstX86Base<Machine>::Traits::RegisterSet::getEncodedXmm(
             SrcVar->getRegNum());
     if (Dest->hasReg()) {
-      Asm->movd(InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
+      Asm->movd(Dest->getType(),
+                InstX86Base<Machine>::Traits::RegisterSet::getEncodedGPR(
                     Dest->getRegNum()),
                 SrcReg);
     } else {
@@ -2396,7 +2455,7 @@
           static_cast<typename InstX86Base<Machine>::Traits::TargetLowering *>(
               Func->getTarget())
               ->stackVarToAsmOperand(Dest));
-      Asm->movd(StackAddr, SrcReg);
+      Asm->movd(Dest->getType(), StackAddr, SrcReg);
     }
   }
 }
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 6724a61..466564d 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -792,7 +792,7 @@
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
     for (const JumpTableData &JT : Ctx->getJumpTables())
-      Writer->writeJumpTable(JT, llvm::ELF::R_386_32);
+      Writer->writeJumpTable(JT, TargetX8632::Traits::RelFixup);
   } break;
   case FT_Asm:
     // Already emitted from Cfg
@@ -821,7 +821,8 @@
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
+    Writer->writeDataSection(Vars, TargetX8632::Traits::RelFixup,
+                             SectionSuffix);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index 29066aa..e0acbd6 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -68,6 +68,7 @@
   static const GPRRegister Encoded_Reg_Accumulator = RegX8632::Encoded_Reg_eax;
   static const GPRRegister Encoded_Reg_Counter = RegX8632::Encoded_Reg_ecx;
   static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32;
+  static const FixupKind RelFixup = llvm::ELF::R_386_32;
 
   class Operand {
   public:
@@ -272,6 +273,7 @@
   };
 
   static const char *TargetName;
+  static constexpr Type WordType = IceType_i32;
 
   static IceString getRegName(SizeT RegNum, Type Ty) {
     assert(RegNum < RegisterSet::Reg_NUM);
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 9056648..41d24cc 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -123,7 +123,7 @@
 }
 
 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
-// OperandList in lowerCall. std::max() was supposed to work, but it doesn't.
+// OperandList in lowerCall. std::max() is supposed to work, but it doesn't.
 constexpr SizeT constexprMax(SizeT S0, SizeT S1) { return S0 < S1 ? S1 : S0; }
 
 } // end of anonymous namespace
@@ -239,7 +239,6 @@
   Variable *Dest = Instr->getDest();
   // ReturnReg doubles as ReturnRegLo as necessary.
   Variable *ReturnReg = nullptr;
-  Variable *ReturnRegHi = nullptr;
   if (Dest) {
     switch (Dest->getType()) {
     case IceType_NUM:
@@ -250,12 +249,8 @@
     case IceType_i8:
     case IceType_i16:
     case IceType_i32:
-      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
-      break;
     case IceType_i64:
-      // TODO(jpp): return i64 in a GPR.
-      ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
-      ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      ReturnReg = makeReg(Dest->getType(), Traits::RegisterSet::Reg_eax);
       break;
     case IceType_f32:
     case IceType_f64:
@@ -271,27 +266,16 @@
     }
   }
 
-  Operand *CallTarget = legalize(Instr->getCallTarget());
+  Operand *CallTarget = legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm);
   const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
   if (NeedSandboxing) {
-    if (llvm::isa<Constant>(CallTarget)) {
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-    } else {
-      Variable *CallTargetVar = nullptr;
-      _mov(CallTargetVar, CallTarget);
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-      const SizeT BundleSize =
-          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
-      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
-      CallTarget = CallTargetVar;
-    }
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
   }
   Inst *NewCall = Traits::Insts::Call::create(Func, ReturnReg, CallTarget);
   Context.insert(NewCall);
-  if (NeedSandboxing)
-    _bundle_unlock();
-  if (ReturnRegHi)
-    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
+  if (NeedSandboxing) {
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
+  }
 
   // Add the appropriate offset to esp.  The call instruction takes care
   // of resetting the stack offset during emission.
@@ -315,25 +299,11 @@
 
   assert(ReturnReg && "x86-64 always returns value on registers.");
 
-  // Assign the result of the call to Dest.
-  if (ReturnRegHi) {
-    assert(Dest->getType() == IceType_i64);
-    split64(Dest);
-    Variable *DestLo = Dest->getLo();
-    Variable *DestHi = Dest->getHi();
-    _mov(DestLo, ReturnReg);
-    _mov(DestHi, ReturnRegHi);
-    return;
-  }
-
-  assert(Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64 ||
-         Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
-         Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
-         isVectorType(Dest->getType()));
-
-  if (isScalarFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
+  if (isVectorType(Dest->getType())) {
     _movp(Dest, ReturnReg);
   } else {
+    assert(isScalarFloatingType(Dest->getType()) ||
+           isScalarIntegerType(Dest->getType()));
     _mov(Dest, ReturnReg);
   }
 }
@@ -356,36 +326,36 @@
        ++i) {
     Variable *Arg = Args[i];
     Type Ty = Arg->getType();
-    if ((isVectorType(Ty) || isScalarFloatingType(Ty)) &&
-        NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
-      // Replace Arg in the argument list with the home register.  Then
-      // generate an instruction in the prolog to copy the home register
-      // to the assigned location of Arg.
-      int32_t RegNum = getRegisterForXmmArgNum(NumXmmArgs);
+    Variable *RegisterArg = nullptr;
+    int32_t RegNum = Variable::NoRegister;
+    if ((isVectorType(Ty) || isScalarFloatingType(Ty))) {
+      if (NumXmmArgs >= Traits::X86_MAX_XMM_ARGS) {
+        continue;
+      }
+      RegNum = getRegisterForXmmArgNum(NumXmmArgs);
       ++NumXmmArgs;
-      Variable *RegisterArg = Func->makeVariable(Ty);
-      if (BuildDefs::dump())
-        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
-      RegisterArg->setRegNum(RegNum);
-      RegisterArg->setIsArg();
-      Arg->setIsArg(false);
-
-      Args[i] = RegisterArg;
-      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
-    } else if (isScalarIntegerType(Ty) &&
-               NumGprArgs < Traits::X86_MAX_GPR_ARGS) {
-      int32_t RegNum = getRegisterForGprArgNum(NumGprArgs);
+      RegisterArg = Func->makeVariable(Ty);
+    } else if (isScalarIntegerType(Ty)) {
+      if (NumGprArgs >= Traits::X86_MAX_GPR_ARGS) {
+        continue;
+      }
+      RegNum = getRegisterForGprArgNum(NumGprArgs);
       ++NumGprArgs;
-      Variable *RegisterArg = Func->makeVariable(Ty);
-      if (BuildDefs::dump())
-        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
-      RegisterArg->setRegNum(RegNum);
-      RegisterArg->setIsArg();
-      Arg->setIsArg(false);
-
-      Args[i] = RegisterArg;
-      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+      RegisterArg = Func->makeVariable(Ty);
     }
+    assert(RegNum != Variable::NoRegister);
+    assert(RegisterArg != nullptr);
+    // Replace Arg in the argument list with the home register.  Then
+    // generate an instruction in the prolog to copy the home register
+    // to the assigned location of Arg.
+    if (BuildDefs::dump())
+      RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
+    RegisterArg->setRegNum(RegNum);
+    RegisterArg->setIsArg();
+    Arg->setIsArg(false);
+
+    Args[i] = RegisterArg;
+    Context.insert(InstAssign::create(Func, Arg, RegisterArg));
   }
 }
 
@@ -393,19 +363,11 @@
   Variable *Reg = nullptr;
   if (Inst->hasRetValue()) {
     Operand *Src0 = legalize(Inst->getRetValue());
-    // TODO(jpp): this is not needed.
-    if (Src0->getType() == IceType_i64) {
-      Variable *eax =
-          legalizeToReg(loOperand(Src0), Traits::RegisterSet::Reg_eax);
-      Variable *edx =
-          legalizeToReg(hiOperand(Src0), Traits::RegisterSet::Reg_edx);
-      Reg = eax;
-      Context.insert(InstFakeUse::create(Func, edx));
-    } else if (isScalarFloatingType(Src0->getType())) {
-      _fld(Src0);
-    } else if (isVectorType(Src0->getType())) {
+    if (isVectorType(Src0->getType()) ||
+        isScalarFloatingType(Src0->getType())) {
       Reg = legalizeToReg(Src0, Traits::RegisterSet::Reg_xmm0);
     } else {
+      assert(isScalarIntegerType(Src0->getType()));
       _mov(Reg, Src0, Traits::RegisterSet::Reg_eax);
     }
   }
@@ -577,19 +539,17 @@
   unsigned NumGPRArgs = 0;
   for (Variable *Arg : Args) {
     // Skip arguments passed in registers.
-    if (isVectorType(Arg->getType()) && NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
-      ++NumXmmArgs;
-      continue;
-    }
-    if (isScalarFloatingType(Arg->getType()) &&
-        NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
-      ++NumXmmArgs;
-      continue;
-    }
-    if (isScalarIntegerType(Arg->getType()) &&
-        NumGPRArgs < Traits::X86_MAX_GPR_ARGS) {
-      ++NumGPRArgs;
-      continue;
+    if (isVectorType(Arg->getType()) || isScalarFloatingType(Arg->getType())) {
+      if (NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
+        ++NumXmmArgs;
+        continue;
+      }
+    } else {
+      assert(isScalarIntegerType(Arg->getType()));
+      if (NumGPRArgs < Traits::X86_MAX_GPR_ARGS) {
+        ++NumGPRArgs;
+        continue;
+      }
     }
     finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
   }
@@ -679,23 +639,9 @@
     }
   }
 
-  if (!Ctx->getFlags().getUseSandboxing())
-    return;
-  // Change the original ret instruction into a sandboxed return sequence.
-  // t:ecx = pop
-  // bundle_lock
-  // and t, ~31
-  // jmp *t
-  // bundle_unlock
-  // FakeUse <original_ret_operand>
-  Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
-  _pop(T_ecx);
-  lowerIndirectJump(T_ecx);
-  if (RI->getSrcSize()) {
-    Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
-    Context.insert(InstFakeUse::create(Func, RetValue));
+  if (Ctx->getFlags().getUseSandboxing()) {
+    llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
   }
-  RI->setDeleted();
 }
 
 void TargetX8664::emitJumpTable(const Cfg *Func,
@@ -858,8 +804,7 @@
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
     for (const JumpTableData &JumpTable : Ctx->getJumpTables())
-      // TODO(jpp): not 386.
-      Writer->writeJumpTable(JumpTable, llvm::ELF::R_386_32);
+      Writer->writeJumpTable(JumpTable, TargetX8664::Traits::RelFixup);
   } break;
   case FT_Asm:
     // Already emitted from Cfg
@@ -888,8 +833,8 @@
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    // TODO(jpp): not 386.
-    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
+    Writer->writeDataSection(Vars, TargetX8664::Traits::RelFixup,
+                             SectionSuffix);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 89fc203..4a12004 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -66,7 +66,8 @@
   using RegisterSet = ::Ice::RegX8664;
   static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax;
   static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx;
-  static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32; // TODO(jpp): ???
+  static const FixupKind PcRelFixup = llvm::ELF::R_X86_64_PC32;
+  static const FixupKind RelFixup = llvm::ELF::R_X86_64_32S;
 
   class Operand {
   public:
@@ -270,8 +271,8 @@
 
     static Address ofConstPool(Assembler *Asm, const Constant *Imm) {
       // TODO(jpp): ???
-      AssemblerFixup *Fixup = Asm->createFixup(llvm::ELF::R_386_32, Imm);
-      const RelocOffsetT Offset = 0;
+      AssemblerFixup *Fixup = Asm->createFixup(RelFixup, Imm);
+      const RelocOffsetT Offset = 4;
       return Address(ABSOLUTE, Offset, Fixup);
     }
   };
@@ -293,6 +294,7 @@
   };
 
   static const char *TargetName;
+  static constexpr Type WordType = IceType_i64;
 
   static IceString getRegName(SizeT RegNum, Type Ty) {
     assert(RegNum < RegisterSet::Reg_NUM);
@@ -331,7 +333,7 @@
 #define X(val, encode, name64, name32, name16, name8, scratch, preserved,      \
           stackptr, frameptr, isInt, isFP)                                     \
   (*IntegerRegisters)[RegisterSet::val] = isInt;                               \
-  (*IntegerRegistersI8)[RegisterSet::val] = 1;                                 \
+  (*IntegerRegistersI8)[RegisterSet::val] = isInt;                             \
   (*FloatRegisters)[RegisterSet::val] = isFP;                                  \
   (*VectorRegisters)[RegisterSet::val] = isFP;                                 \
   (*ScratchRegs)[RegisterSet::val] = scratch;
@@ -450,7 +452,7 @@
   /// address.
   static const uint32_t X86_STACK_ALIGNMENT_BYTES;
   /// Size of the return address on the stack
-  static const uint32_t X86_RET_IP_SIZE_BYTES = 4;
+  static const uint32_t X86_RET_IP_SIZE_BYTES = 8;
   /// The number of different NOP instructions
   static const uint32_t X86_NUM_NOP_VARIANTS = 5;
 
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 342c97b..da863f4 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -21,6 +21,7 @@
 #include "IceInst.h"
 #include "IceSwitchLowering.h"
 #include "IceTargetLowering.h"
+#include "IceUtils.h"
 
 #include <type_traits>
 #include <utility>
@@ -80,10 +81,9 @@
                            : Traits::RegisterSet::Reg_esp;
   }
   size_t typeWidthInBytesOnStack(Type Ty) const override {
-    // Round up to the next multiple of 4 bytes.  In particular, i1,
-    // i8, and i16 are rounded up to 4 bytes.
-    // TODO(jpp): this needs to round to multiples of 8 bytes in x86-64.
-    return (typeWidthInBytes(Ty) + 3) & ~3;
+    // Round up to a multiple of the word size (the width of WordType).
+    const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
+    return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
   }
 
   SizeT getMinJumpTableSize() const override { return 4; }
@@ -98,14 +98,40 @@
   void emit(const ConstantDouble *C) const final;
 
   void initNodeForLowering(CfgNode *Node) override;
-  /// Ensure that a 64-bit Variable has been split into 2 32-bit
+  /// x86-32: Ensure that a 64-bit Variable has been split into 2 32-bit
   /// Variables, creating them if necessary.  This is needed for all
   /// I64 operations, and it is needed for pushing F64 arguments for
   /// function calls using the 32-bit push instruction (though the
   /// latter could be done by directly writing to the stack).
-  void split64(Variable *Var);
-  Operand *loOperand(Operand *Operand);
-  Operand *hiOperand(Operand *Operand);
+  ///
+  /// x86-64: Complains loudly if invoked because the cpu can handle
+  /// 64-bit types natively.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type split64(Variable *Var);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type split64(Variable *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (split64)");
+  }
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  loOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *loOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (loOperand)");
+  }
+
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, Operand>::type *
+  hiOperand(Operand *Operand);
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, Operand>::type *hiOperand(Operand *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (hiOperand)");
+  }
+
   void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                               size_t BasicFrameOffset, size_t &InArgsSizeBytes);
   typename Traits::Address stackVarToAsmOperand(const Variable *Var) const;
@@ -128,6 +154,19 @@
   void lowerExtractElement(const InstExtractElement *Inst) override;
   void lowerFcmp(const InstFcmp *Inst) override;
   void lowerIcmp(const InstIcmp *Inst) override;
+  /// Complains loudly if invoked because the cpu can handle 64-bit types
+  /// natively.
+  template <typename T = Traits>
+  typename std::enable_if<T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *) {
+    llvm::report_fatal_error(
+        "Hey, yo! This is x86-64. Watcha doin'? (lowerIcmp64)");
+  }
+  /// On x86-32, lowerIcmp64 handles 64-bit icmp lowering.
+  template <typename T = Traits>
+  typename std::enable_if<!T::Is64Bit, void>::type
+  lowerIcmp64(const InstIcmp *Inst);
+
   void lowerIntrinsicCall(const InstIntrinsicCall *Inst) override;
   void lowerInsertElement(const InstInsertElement *Inst) override;
   void lowerLoad(const InstLoad *Inst) override;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 8dad58e..e190b5d 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -77,6 +77,7 @@
 public:
   enum BoolFoldingProducerKind {
     PK_None,
+    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
     PK_Icmp32,
     PK_Icmp64,
     PK_Fcmp,
@@ -120,7 +121,7 @@
 typename BoolFolding<MachineTraits>::BoolFoldingProducerKind
 BoolFolding<MachineTraits>::getProducerKind(const Inst *Instr) {
   if (llvm::isa<InstIcmp>(Instr)) {
-    if (Instr->getSrc(0)->getType() != IceType_i64)
+    if (MachineTraits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
       return PK_Icmp32;
     return PK_None; // TODO(stichnot): actually PK_Icmp64;
   }
@@ -643,10 +644,10 @@
       } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
         // An AtomicLoad intrinsic qualifies as long as it has a valid
         // memory ordering, and can be implemented in a single
-        // instruction (i.e., not i64).
+        // instruction (i.e., not i64 on x86-32).
         Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
         if (ID == Intrinsics::AtomicLoad &&
-            Intrin->getDest()->getType() != IceType_i64 &&
+            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
             Intrinsics::isMemoryOrderValid(
                 ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
           LoadDest = Intrin->getDest();
@@ -724,6 +725,10 @@
 
 template <class Machine>
 Variable *TargetX86Base<Machine>::getPhysicalRegister(SizeT RegNum, Type Ty) {
+  // Special case: never allow partial reads/writes to/from %rBP and %rSP.
+  if (RegNum == Traits::RegisterSet::Reg_esp ||
+      RegNum == Traits::RegisterSet::Reg_ebp)
+    Ty = Traits::WordType;
   if (Ty == IceType_void)
     Ty = IceType_i32;
   if (PhysicalRegisters[Ty].empty())
@@ -770,7 +775,7 @@
   }
   if (Offset)
     Str << Offset;
-  const Type FrameSPTy = IceType_i32;
+  const Type FrameSPTy = Traits::WordType;
   Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
 }
 
@@ -810,8 +815,7 @@
   Variable *Lo = Arg->getLo();
   Variable *Hi = Arg->getHi();
   Type Ty = Arg->getType();
-  if (Lo && Hi && Ty == IceType_i64) {
-    // TODO(jpp): This special case is not needed for x86-64.
+  if (!Traits::Is64Bit && Lo && Hi && Ty == IceType_i64) {
     assert(Lo->getType() != IceType_i64); // don't want infinite recursion
     assert(Hi->getType() != IceType_i64); // don't want infinite recursion
     finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
@@ -824,7 +828,7 @@
   Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
   InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
   if (Arg->hasReg()) {
-    assert(Ty != IceType_i64);
+    assert(Ty != IceType_i64 || Traits::Is64Bit);
     typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create(
         Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset()));
     if (isVectorType(Arg->getType())) {
@@ -840,11 +844,13 @@
 }
 
 template <class Machine> Type TargetX86Base<Machine>::stackSlotType() {
-  // TODO(jpp): this is wrong for x86-64.
-  return IceType_i32;
+  return Traits::WordType;
 }
 
-template <class Machine> void TargetX86Base<Machine>::split64(Variable *Var) {
+template <class Machine>
+template <typename T>
+typename std::enable_if<!T::Is64Bit, void>::type
+TargetX86Base<Machine>::split64(Variable *Var) {
   switch (Var->getType()) {
   default:
     return;
@@ -876,7 +882,9 @@
 }
 
 template <class Machine>
-Operand *TargetX86Base<Machine>::loOperand(Operand *Operand) {
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX86Base<Machine>::loOperand(Operand *Operand) {
   assert(Operand->getType() == IceType_i64 ||
          Operand->getType() == IceType_f64);
   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
@@ -905,7 +913,9 @@
 }
 
 template <class Machine>
-Operand *TargetX86Base<Machine>::hiOperand(Operand *Operand) {
+template <typename T>
+typename std::enable_if<!T::Is64Bit, Operand>::type *
+TargetX86Base<Machine>::hiOperand(Operand *Operand) {
   assert(Operand->getType() == IceType_i64 ||
          Operand->getType() == IceType_f64);
   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
@@ -1107,8 +1117,8 @@
     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
       std::swap(Src0, Src1);
   }
-  if (Dest->getType() == IceType_i64) {
-    // These helper-call-involved instructions are lowered in this
+  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
+    // These x86-32 helper-call-involved instructions are lowered in this
     // separate switch. This is because loOperand() and hiOperand()
     // may insert redundant instructions for constant blinding and
     // pooling. Such redundant instructions will fail liveness analysis
@@ -1656,7 +1666,8 @@
       Context.insert(InstFakeUse::create(Func, T_eax));
     } else {
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(T_edx, Zero, Traits::RegisterSet::Reg_edx);
+      T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx);
+      _mov(T_edx, Zero);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
       _div(T_edx, Src1, T);
       _mov(Dest, T_edx);
@@ -1721,7 +1732,7 @@
       _mov(Dest, T);
       Context.insert(InstFakeUse::create(Func, T_eax));
     } else {
-      T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
+      T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx);
       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
       _cbwdq(T_edx, T);
       _idiv(T_edx, Src1, T);
@@ -1765,7 +1776,7 @@
   Variable *Dest = Inst->getDest();
   Operand *Src0 = Inst->getSrc(0);
   assert(Dest->getType() == Src0->getType());
-  if (Dest->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
     Src0 = legalize(Src0);
     Operand *Src0Lo = loOperand(Src0);
     Operand *Src0Hi = hiOperand(Src0);
@@ -1870,7 +1881,7 @@
         _psra(T, ShiftConstant);
         _movp(Dest, T);
       }
-    } else if (Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
       Constant *Shift = Ctx->getConstantInt32(31);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -1930,7 +1941,7 @@
       _movp(T, Src0RM);
       _pand(T, OneMask);
       _movp(Dest, T);
-    } else if (Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // t1=movzx src; dst.lo=t1; dst.hi=0
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -1951,13 +1962,16 @@
       // t = Src0RM; t &= 1; Dest = t
       Constant *One = Ctx->getConstantInt32(1);
       Type DestTy = Dest->getType();
-      Variable *T;
+      Variable *T = nullptr;
       if (DestTy == IceType_i8) {
-        T = makeReg(DestTy);
         _mov(T, Src0RM);
       } else {
+        assert(DestTy != IceType_i1);
+        assert(Traits::Is64Bit || DestTy != IceType_i64);
         // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
-        T = makeReg(IceType_i32);
+        // In x86-64 we need to widen T to 64-bits to ensure that T -- if
+        // written to the stack (i.e., in -Om1) -- will be fully zero-extended.
+        T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
         _movzx(T, Src0RM);
       }
       _and(T, One);
@@ -1982,7 +1996,7 @@
       _movp(Dest, T);
     } else {
       Operand *Src0 = legalizeUndef(Inst->getSrc(0));
-      if (Src0->getType() == IceType_i64)
+      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
         Src0 = loOperand(Src0);
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       // t1 = trunc Src0RM; Dest = t1
@@ -2013,7 +2027,7 @@
       Variable *T = makeReg(Dest->getType());
       _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq);
       _movp(Dest, T);
-    } else if (Dest->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // Use a helper for converting floating-point values to 64-bit
       // integers.  SSE2 appears to have no way to convert from xmm
       // registers to something like the edx:eax register pair, and
@@ -2032,7 +2046,15 @@
     } else {
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Dest->getType() == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Dest->getType() != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      // cvt() requires its integer argument to be a GPR.
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
@@ -2050,14 +2072,18 @@
       Call->addArg(Inst->getSrc(0));
       lowerCall(Call);
     } else if (Dest->getType() == IceType_i64 ||
-               Dest->getType() == IceType_i32) {
+               (!Traits::Is64Bit && Dest->getType() == IceType_i32)) {
       // Use a helper for both x86-32 and x86-64.
-      split64(Dest);
+      if (!Traits::Is64Bit)
+        split64(Dest);
       const SizeT MaxSrcs = 1;
       Type DestType = Dest->getType();
       Type SrcType = Inst->getSrc(0)->getType();
       IceString TargetString;
-      if (isInt32Asserting32Or64(DestType)) {
+      if (Traits::Is64Bit) {
+        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
+                                                         : H_fptoui_f64_i64;
+      } else if (isInt32Asserting32Or64(DestType)) {
         TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
                                                          : H_fptoui_f64_i32;
       } else {
@@ -2071,7 +2097,15 @@
     } else {
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = makeReg(IceType_i32);
+      assert(Dest->getType() != IceType_i64);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Dest->getType() == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Dest->getType() != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
@@ -2090,7 +2124,7 @@
       Variable *T = makeReg(Dest->getType());
       _cvt(T, Src0RM, Traits::Insts::Cvt::Dq2ps);
       _movp(Dest, T);
-    } else if (Inst->getSrc(0)->getType() == IceType_i64) {
+    } else if (!Traits::Is64Bit && Inst->getSrc(0)->getType() == IceType_i64) {
       // Use a helper for x86-32.
       const SizeT MaxSrcs = 1;
       Type DestType = Dest->getType();
@@ -2106,9 +2140,16 @@
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
       // Sign-extend the operand.
       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        T_1 = makeReg(IceType_i32);
+      }
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
-      if (Src0RM->getType() == IceType_i32)
+      if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
         _movsx(T_1, Src0RM);
@@ -2126,7 +2167,7 @@
       Call->addArg(Src0);
       lowerCall(Call);
     } else if (Src0->getType() == IceType_i64 ||
-               Src0->getType() == IceType_i32) {
+               (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
       // Use a helper for x86-32 and x86-64.  Also use a helper for
       // i32 on x86-32.
       const SizeT MaxSrcs = 1;
@@ -2147,9 +2188,17 @@
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       // Zero-extend the operand.
       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_1 = nullptr;
+      if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
+        T_1 = makeReg(IceType_i64);
+      } else {
+        assert(Src0RM->getType() != IceType_i64);
+        assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
+        T_1 = makeReg(IceType_i32);
+      }
+      T_1->setWeightInfinite();
       Variable *T_2 = makeReg(Dest->getType());
-      if (Src0RM->getType() == IceType_i32)
+      if (Src0RM->getType() == T_1->getType())
         _mov(T_1, Src0RM);
       else
         _movzx(T_1, Src0RM);
@@ -2205,77 +2254,96 @@
       _mov(Dest, Spill);
     } break;
     case IceType_i64: {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      assert(Src0RM->getType() == IceType_f64);
-      // a.i64 = bitcast b.f64 ==>
-      //   s.f64 = spill b.f64
-      //   t_lo.i32 = lo(s.f64)
-      //   a_lo.i32 = t_lo.i32
-      //   t_hi.i32 = hi(s.f64)
-      //   a_hi.i32 = t_hi.i32
-      Operand *SpillLo, *SpillHi;
-      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
-        typename Traits::SpillVariable *SpillVar =
-            Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
-        SpillVar->setLinkedTo(Src0Var);
-        Variable *Spill = SpillVar;
-        Spill->setWeight(RegWeight::Zero);
-        _movq(Spill, Src0RM);
-        SpillLo = Traits::VariableSplit::create(Func, Spill,
-                                                Traits::VariableSplit::Low);
-        SpillHi = Traits::VariableSplit::create(Func, Spill,
-                                                Traits::VariableSplit::High);
+      assert(Src0->getType() == IceType_f64);
+      if (Traits::Is64Bit) {
+        // Movd requires its fp argument (in this case, the bitcast source) to
+        // be an xmm register.
+        Variable *Src0R = legalizeToReg(Src0);
+        Variable *T = makeReg(IceType_i64);
+        _movd(T, Src0R);
+        _mov(Dest, T);
       } else {
-        SpillLo = loOperand(Src0RM);
-        SpillHi = hiOperand(Src0RM);
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        // a.i64 = bitcast b.f64 ==>
+        //   s.f64 = spill b.f64
+        //   t_lo.i32 = lo(s.f64)
+        //   a_lo.i32 = t_lo.i32
+        //   t_hi.i32 = hi(s.f64)
+        //   a_hi.i32 = t_hi.i32
+        Operand *SpillLo, *SpillHi;
+        if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+          typename Traits::SpillVariable *SpillVar =
+              Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
+          SpillVar->setLinkedTo(Src0Var);
+          Variable *Spill = SpillVar;
+          Spill->setWeight(RegWeight::Zero);
+          _movq(Spill, Src0RM);
+          SpillLo = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::Low);
+          SpillHi = Traits::VariableSplit::create(Func, Spill,
+                                                  Traits::VariableSplit::High);
+        } else {
+          SpillLo = loOperand(Src0RM);
+          SpillHi = hiOperand(Src0RM);
+        }
+
+        Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+        Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Variable *T_Lo = makeReg(IceType_i32);
+        Variable *T_Hi = makeReg(IceType_i32);
+
+        _mov(T_Lo, SpillLo);
+        _mov(DestLo, T_Lo);
+        _mov(T_Hi, SpillHi);
+        _mov(DestHi, T_Hi);
       }
-
-      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *T_Lo = makeReg(IceType_i32);
-      Variable *T_Hi = makeReg(IceType_i32);
-
-      _mov(T_Lo, SpillLo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, SpillHi);
-      _mov(DestHi, T_Hi);
     } break;
     case IceType_f64: {
-      Src0 = legalize(Src0);
       assert(Src0->getType() == IceType_i64);
-      if (llvm::isa<typename Traits::X86OperandMem>(Src0)) {
-        Variable *T = Func->makeVariable(Dest->getType());
-        _movq(T, Src0);
-        _movq(Dest, T);
-        break;
-      }
-      // a.f64 = bitcast b.i64 ==>
-      //   t_lo.i32 = b_lo.i32
-      //   FakeDef(s.f64)
-      //   lo(s.f64) = t_lo.i32
-      //   t_hi.i32 = b_hi.i32
-      //   hi(s.f64) = t_hi.i32
-      //   a.f64 = s.f64
-      typename Traits::SpillVariable *SpillVar =
-          Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
-      SpillVar->setLinkedTo(Dest);
-      Variable *Spill = SpillVar;
-      Spill->setWeight(RegWeight::Zero);
+      if (Traits::Is64Bit) {
+        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+        Variable *T = makeReg(IceType_f64);
+        // Movd requires its fp argument (in this case, the bitcast destination)
+        // to be an xmm register.
+        T->setWeightInfinite();
+        _movd(T, Src0RM);
+        _mov(Dest, T);
+      } else {
+        Src0 = legalize(Src0);
+        if (llvm::isa<typename Traits::X86OperandMem>(Src0)) {
+          Variable *T = Func->makeVariable(Dest->getType());
+          _movq(T, Src0);
+          _movq(Dest, T);
+          break;
+        }
+        // a.f64 = bitcast b.i64 ==>
+        //   t_lo.i32 = b_lo.i32
+        //   FakeDef(s.f64)
+        //   lo(s.f64) = t_lo.i32
+        //   t_hi.i32 = b_hi.i32
+        //   hi(s.f64) = t_hi.i32
+        //   a.f64 = s.f64
+        typename Traits::SpillVariable *SpillVar =
+            Func->makeVariable<typename Traits::SpillVariable>(IceType_f64);
+        SpillVar->setLinkedTo(Dest);
+        Variable *Spill = SpillVar;
+        Spill->setWeight(RegWeight::Zero);
 
-      Variable *T_Lo = nullptr, *T_Hi = nullptr;
-      typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create(
-          Func, Spill, Traits::VariableSplit::Low);
-      typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create(
-          Func, Spill, Traits::VariableSplit::High);
-      _mov(T_Lo, loOperand(Src0));
-      // Technically, the Spill is defined after the _store happens, but
-      // SpillLo is considered a "use" of Spill so define Spill before it
-      // is used.
-      Context.insert(InstFakeDef::create(Func, Spill));
-      _store(T_Lo, SpillLo);
-      _mov(T_Hi, hiOperand(Src0));
-      _store(T_Hi, SpillHi);
-      _movq(Dest, Spill);
+        Variable *T_Lo = nullptr, *T_Hi = nullptr;
+        typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::Low);
+        typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create(
+            Func, Spill, Traits::VariableSplit::High);
+        _mov(T_Lo, loOperand(Src0));
+        // Technically, the Spill is defined after the _store happens, but
+        // SpillLo is considered a "use" of Spill so define Spill before it
+        // is used.
+        Context.insert(InstFakeDef::create(Func, Spill));
+        _store(T_Lo, SpillLo);
+        _mov(T_Hi, hiOperand(Src0));
+        _store(T_Hi, SpillHi);
+        _movq(Dest, Spill);
+      }
     } break;
     case IceType_v8i1: {
       assert(Src0->getType() == IceType_i8);
@@ -2615,32 +2683,8 @@
     return;
   }
 
-  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
-  if (Src0->getType() == IceType_i64) {
-    InstIcmp::ICond Condition = Inst->getCondition();
-    size_t Index = static_cast<size_t>(Condition);
-    assert(Index < Traits::TableIcmp64Size);
-    Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
-    Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
-    Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
-    Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
-    Constant *Zero = Ctx->getConstantZero(IceType_i32);
-    Constant *One = Ctx->getConstantInt32(1);
-    typename Traits::Insts::Label *LabelFalse =
-        Traits::Insts::Label::create(Func, this);
-    typename Traits::Insts::Label *LabelTrue =
-        Traits::Insts::Label::create(Func, this);
-    _mov(Dest, One);
-    _cmp(Src0HiRM, Src1HiRI);
-    if (Traits::TableIcmp64[Index].C1 != Traits::Cond::Br_None)
-      _br(Traits::TableIcmp64[Index].C1, LabelTrue);
-    if (Traits::TableIcmp64[Index].C2 != Traits::Cond::Br_None)
-      _br(Traits::TableIcmp64[Index].C2, LabelFalse);
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(Traits::TableIcmp64[Index].C3, LabelTrue);
-    Context.insert(LabelFalse);
-    _mov_nonkillable(Dest, Zero);
-    Context.insert(LabelTrue);
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
+    lowerIcmp64(Inst);
     return;
   }
 
@@ -2650,6 +2694,40 @@
   _setcc(Dest, Traits::getIcmp32Mapping(Inst->getCondition()));
 }
 
+template <typename Machine>
+template <typename T>
+typename std::enable_if<!T::Is64Bit, void>::type // Only when !Is64Bit.
+TargetX86Base<Machine>::lowerIcmp64(const InstIcmp *Inst) {
+  // a=icmp cond,b,c ==> a=1; cmp b.hi,c.hi; br; cmp b.lo,c.lo; br; a=0; L1:
+  Operand *Src0 = legalize(Inst->getSrc(0));
+  Operand *Src1 = legalize(Inst->getSrc(1));
+  Variable *Dest = Inst->getDest();
+  InstIcmp::ICond Condition = Inst->getCondition();
+  size_t Index = static_cast<size_t>(Condition); // Row in TableIcmp64.
+  assert(Index < Traits::TableIcmp64Size);
+  Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
+  Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
+  Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
+  Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  Constant *One = Ctx->getConstantInt32(1);
+  typename Traits::Insts::Label *LabelFalse =
+      Traits::Insts::Label::create(Func, this);
+  typename Traits::Insts::Label *LabelTrue =
+      Traits::Insts::Label::create(Func, this);
+  _mov(Dest, One); // Start with a=1; only the false path overwrites it.
+  _cmp(Src0HiRM, Src1HiRI); // High halves first.
+  if (Traits::TableIcmp64[Index].C1 != Traits::Cond::Br_None)
+    _br(Traits::TableIcmp64[Index].C1, LabelTrue);
+  if (Traits::TableIcmp64[Index].C2 != Traits::Cond::Br_None)
+    _br(Traits::TableIcmp64[Index].C2, LabelFalse);
+  _cmp(Src0LoRM, Src1LoRI); // Then low halves.
+  _br(Traits::TableIcmp64[Index].C3, LabelTrue);
+  Context.insert(LabelFalse);
+  _mov_nonkillable(Dest, Zero); // False path: a=0.
+  Context.insert(LabelTrue);
+}
+
 template <class Machine>
 void TargetX86Base<Machine>::lowerInsertElement(const InstInsertElement *Inst) {
   Operand *SourceVectNotLegalized = Inst->getSrc(0);
@@ -2848,7 +2926,7 @@
       return;
     }
     Variable *Dest = Instr->getDest();
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // Follow what GCC does and use a movq instead of what lowerLoad()
       // normally does (split the load into two).
       // Thus, this skips load/arithmetic op folding. Load/arithmetic folding
@@ -2898,7 +2976,7 @@
     // Add a fence after the store to make it visible.
     Operand *Value = Instr->getArg(0);
     Operand *Ptr = Instr->getArg(1);
-    if (Value->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
       // Use a movq instead of what lowerStore() normally does
       // (split the store into two), following what GCC does.
       // Cast the bits from int -> to an xmm register first.
@@ -2922,7 +3000,7 @@
     Operand *Val = Instr->getArg(0);
     // In 32-bit mode, bswap only works on 32-bit arguments, and the
     // argument must be a register. Use rotate left for 16-bit bswap.
-    if (Val->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
       Val = legalizeUndef(Val);
       Variable *T_Lo = legalizeToReg(loOperand(Val));
       Variable *T_Hi = legalizeToReg(hiOperand(Val));
@@ -2932,7 +3010,8 @@
       _bswap(T_Hi);
       _mov(DestLo, T_Hi);
       _mov(DestHi, T_Lo);
-    } else if (Val->getType() == IceType_i32) {
+    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
+               Val->getType() == IceType_i32) {
       Variable *T = legalizeToReg(Val);
       _bswap(T);
       _mov(Dest, T);
@@ -2949,11 +3028,28 @@
   }
   case Intrinsics::Ctpop: {
     Variable *Dest = Instr->getDest();
+    Variable *T = nullptr;
     Operand *Val = Instr->getArg(0);
-    InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
-                                        ? H_call_ctpop_i32
-                                        : H_call_ctpop_i64,
-                                    Dest, 1);
+    Type ValTy = Val->getType();
+    assert(ValTy == IceType_i32 || ValTy == IceType_i64);
+
+    if (!Traits::Is64Bit) {
+      T = Dest;
+    } else {
+      T = makeReg(IceType_i64);
+      if (ValTy == IceType_i32) {
+        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
+        // converting it to a 64-bit value, and using ctpop_i64. _movzx should
+        // ensure we will not have any bits set on Val's upper 32 bits.
+        Variable *V = makeReg(IceType_i64);
+        _movzx(V, Val);
+        Val = V;
+      }
+      ValTy = IceType_i64;
+    }
+
+    InstCall *Call = makeHelperCall(
+        ValTy == IceType_i32 ? H_call_ctpop_i32 : H_call_ctpop_i64, T, 1);
     Call->addArg(Val);
     lowerCall(Call);
     // The popcount helpers always return 32-bit values, while the intrinsic's
@@ -2961,10 +3057,33 @@
     // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
     // the user doesn't do that in the IR. If the user does that in the IR,
     // then this zero'ing instruction is dead and gets optimized out.
-    if (Val->getType() == IceType_i64) {
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(DestHi, Zero);
+    if (!Traits::Is64Bit) {
+      assert(T == Dest);
+      if (Val->getType() == IceType_i64) {
+        Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Constant *Zero = Ctx->getConstantZero(IceType_i32);
+        _mov(DestHi, Zero);
+      }
+    } else {
+      assert(Val->getType() == IceType_i64);
+      // T is 64 bit. It needs to be copied to dest. We need to:
+      //
+      // T_1.32 = trunc T.64 to i32
+      // T_2.64 = zext T_1.32 to i64
+      // Dest.<<right_size>> = T_2.<<right_size>>
+      //
+      // which ensures the upper 32 bits will always be cleared. Just doing a
+      //
+      // mov Dest.32 = trunc T.64 to i32
+      //
+      // is dangerous because there's a chance the compiler will optimize this
+      // copy out. To use _movzx we need two new registers (one 32-, and
+      // another 64-bit wide.)
+      Variable *T_1 = makeReg(IceType_i32);
+      _mov(T_1, T);
+      Variable *T_2 = makeReg(IceType_i64);
+      _movzx(T_2, T_1);
+      _mov(Dest, T_2);
     }
     return;
   }
@@ -2974,7 +3093,7 @@
     Operand *Val = legalize(Instr->getArg(0));
     Operand *FirstVal;
     Operand *SecondVal = nullptr;
-    if (Val->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
       FirstVal = loOperand(Val);
       SecondVal = hiOperand(Val);
     } else {
@@ -2991,7 +3110,7 @@
     Operand *Val = legalize(Instr->getArg(0));
     Operand *FirstVal;
     Operand *SecondVal = nullptr;
-    if (Val->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
       FirstVal = hiOperand(Val);
       SecondVal = loOperand(Val);
     } else {
@@ -3099,7 +3218,7 @@
 void TargetX86Base<Machine>::lowerAtomicCmpxchg(Variable *DestPrev,
                                                 Operand *Ptr, Operand *Expected,
                                                 Operand *Desired) {
-  if (Expected->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Expected->getType() == IceType_i64) {
     // Reserve the pre-colored registers first, before adding any more
     // infinite-weight variables from formMemoryOperand's legalization.
     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
@@ -3217,7 +3336,7 @@
     Func->setError("Unknown AtomicRMW operation");
     return;
   case Intrinsics::AtomicAdd: {
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       // All the fall-through paths must set this to true, but use this
       // for asserting.
       NeedsCmpxchg = true;
@@ -3235,7 +3354,7 @@
     return;
   }
   case Intrinsics::AtomicSub: {
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       NeedsCmpxchg = true;
       Op_Lo = &TargetX86Base<Machine>::_sub;
       Op_Hi = &TargetX86Base<Machine>::_sbb;
@@ -3272,7 +3391,7 @@
     Op_Hi = &TargetX86Base<Machine>::_xor;
     break;
   case Intrinsics::AtomicExchange:
-    if (Dest->getType() == IceType_i64) {
+    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
       NeedsCmpxchg = true;
       // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
       // just need to be moved to the ecx and ebx registers.
@@ -3326,7 +3445,7 @@
   // If Op_{Lo,Hi} are nullptr, then just copy the value.
   Val = legalize(Val);
   Type Ty = Val->getType();
-  if (Ty == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
     typename Traits::X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
@@ -3458,7 +3577,7 @@
   if (!Cttz) {
     _xor(T_Dest, ThirtyOne);
   }
-  if (Ty == IceType_i32) {
+  if (Traits::Is64Bit || Ty == IceType_i32) {
     _mov(Dest, T_Dest);
     return;
   }
@@ -4138,7 +4257,7 @@
     std::swap(SrcT, SrcF);
     Cond = InstX86Base<Machine>::getOppositeCondition(Cond);
   }
-  if (DestTy == IceType_i64) {
+  if (!Traits::Is64Bit && DestTy == IceType_i64) {
     SrcT = legalizeUndef(SrcT);
     SrcF = legalizeUndef(SrcF);
     // Set the low portion.
@@ -4160,7 +4279,8 @@
     return;
   }
 
-  assert(DestTy == IceType_i16 || DestTy == IceType_i32);
+  assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
+         (Traits::Is64Bit && DestTy == IceType_i64));
   Variable *T = nullptr;
   SrcF = legalize(SrcF);
   _mov(T, SrcF);
@@ -4177,7 +4297,7 @@
       formMemoryOperand(Addr, Value->getType());
   Type Ty = NewAddr->getType();
 
-  if (Ty == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     Value = legalizeUndef(Value);
     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
     Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
@@ -4225,7 +4345,7 @@
                                                uint64_t Min, uint64_t Max) {
   // TODO(ascull): 64-bit should not reach here but only because it is not
   // implemented yet. This should be able to handle the 64-bit case.
-  assert(Comparison->getType() != IceType_i64);
+  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
   // Subtracting 0 is a nop so don't do it
   if (Min != 0) {
     // Avoid clobbering the comparison by copying it
@@ -4324,7 +4444,7 @@
 
   assert(CaseClusters.size() != 0); // Should always be at least one
 
-  if (Src0->getType() == IceType_i64) {
+  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
     Src0 = legalize(Src0); // get Base/Index into physical registers
     Operand *Src0Lo = loOperand(Src0);
     Operand *Src0Hi = hiOperand(Src0);
@@ -4529,7 +4649,7 @@
   Operand *Src = RMW->getData();
   Type Ty = Src->getType();
   typename Traits::X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
-  if (Ty == IceType_i64) {
+  if (!Traits::Is64Bit && Ty == IceType_i64) {
     Src = legalizeUndef(Src);
     Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
     Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
@@ -4563,7 +4683,8 @@
       return;
     }
   } else {
-    // i8, i16, i32
+    // x86-32: i8, i16, i32
+    // x86-64: i8, i16, i32, i64
     switch (RMW->getOp()) {
     default:
       // TODO(stichnot): Implement other arithmetic operators.
@@ -4608,8 +4729,14 @@
 /// turned into zeroes, since loOperand() and hiOperand() don't expect
 /// Undef input.
 template <class Machine> void TargetX86Base<Machine>::prelowerPhis() {
-  // Pause constant blinding or pooling, blinding or pooling will be done later
-  // during phi lowering assignments
+  if (Traits::Is64Bit) {
+    // On x86-64 we don't need to prelower phis -- the architecture can handle
+    // 64-bit integers natively.
+    return;
+  }
+
+  // Pause constant blinding or pooling; blinding or pooling will be done
+  // later during phi lowering assignments.
   BoolFlagSaver B(RandomizationPoolingPaused, true);
   PhiLowering::prelowerPhis32Bit<TargetX86Base<Machine>>(
       this, Context.getNode(), Func);
@@ -4770,6 +4897,16 @@
     // There should be no constants of vector type (other than undef).
     assert(!isVectorType(Ty));
 
+    // If the operand is a 64 bit constant integer we need to legalize it to a
+    // register in x86-64.
+    if (Traits::Is64Bit) {
+      if (llvm::isa<ConstantInteger64>(Const)) {
+        Variable *V = copyToReg(Const, RegNum);
+        V->setWeightInfinite();
+        return V;
+      }
+    }
+
     // If the operand is an 32 bit constant integer, we should check
     // whether we need to randomize it or pool it.
     if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
@@ -4907,7 +5044,7 @@
 template <class Machine>
 Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) {
   // There aren't any 64-bit integer registers for x86-32.
-  assert(Type != IceType_i64);
+  assert(Traits::Is64Bit || Type != IceType_i64);
   Variable *Reg = Func->makeVariable(Type);
   if (RegNum == Variable::NoRegister)
     Reg->setWeightInfinite();
@@ -4939,8 +5076,15 @@
 }
 
 template <class Machine>
-void TargetX86Base<Machine>::emit(const ConstantInteger64 *) const {
-  llvm::report_fatal_error("Not expecting to emit 64-bit integers");
+void TargetX86Base<Machine>::emit(const ConstantInteger64 *C) const {
+  if (!Traits::Is64Bit) {
+    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
+  } else {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Ctx->getStrEmit();
+    Str << getConstantPrefix() << C->getValue();
+  }
 }
 
 template <class Machine>
@@ -5085,8 +5229,8 @@
                                           MemOperand->getBase(), Mask1);
         // If we have already assigned a physical register, we must come from
         // advancedPhiLowering()=>lowerAssign(). In this case we should reuse
-        // the assigned register as this assignment is that start of its use-def
-        // chain. So we add RegNum argument here.
+        // the assigned register as this assignment is the start of its
+        // use-def chain. So we add RegNum argument here.
         Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
         _lea(RegTemp, TempMemOperand);
         // As source operand doesn't use the dstreg, we don't need to add
diff --git a/unittest/AssemblerX8632/DataMov.cpp b/unittest/AssemblerX8632/DataMov.cpp
index cb2012e..d41acd1 100644
--- a/unittest/AssemblerX8632/DataMov.cpp
+++ b/unittest/AssemblerX8632/DataMov.cpp
@@ -538,7 +538,8 @@
                                                                                \
     __ mov(IceType_i32, GPRRegister::Encoded_Reg_##Src, Immediate(Value));     \
     __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));   \
-    __ movd(XmmRegister::Encoded_Reg_##Dst, GPRRegister::Encoded_Reg_##Src);   \
+    __ movd(IceType_i32, XmmRegister::Encoded_Reg_##Dst,                       \
+            GPRRegister::Encoded_Reg_##Src);                                   \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
@@ -560,7 +561,7 @@
     const uint64_t V1 = 0xFFFFFFFF00000000ull;                                 \
                                                                                \
     __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
-    __ movd(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));                 \
+    __ movd(IceType_i32, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));    \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
@@ -609,7 +610,8 @@
     const uint32_t V0 = Value;                                                 \
                                                                                \
     __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));   \
-    __ movd(GPRRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src);   \
+    __ movd(IceType_i32, GPRRegister::Encoded_Reg_##Dst,                       \
+            XmmRegister::Encoded_Reg_##Src);                                   \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
@@ -631,7 +633,7 @@
     const uint32_t V1 = ~(Value);                                              \
                                                                                \
     __ movss(IceType_f64, XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));   \
-    __ movd(dwordAddress(T1), XmmRegister::Encoded_Reg_##Src);                 \
+    __ movd(IceType_i32, dwordAddress(T1), XmmRegister::Encoded_Reg_##Src);    \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
index 45ff3a9..a85c8f9 100644
--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -1072,7 +1072,7 @@
     __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
     __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
            Immediate(Inst##Size##SrcValue));                                   \
-    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst, IceType_i32, \
                  GPRRegister::Encoded_Reg_##GPR);                              \
                                                                                \
     AssembledTest test = assemble();                                           \
@@ -1092,7 +1092,7 @@
     __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
            Immediate(Inst##Size##DstValue));                                   \
     __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T0));               \
-    __ cvt##Inst(IceType_f##Size, GPRRegister::Encoded_Reg_##GPR,              \
+    __ cvt##Inst(IceType_i32, GPRRegister::Encoded_Reg_##GPR, IceType_f##Size, \
                  XmmRegister::Encoded_Reg_##Src);                              \
                                                                                \
     AssembledTest test = assemble();                                           \
@@ -1132,7 +1132,7 @@
     const uint32_t T1 = allocateDword();                                       \
                                                                                \
     __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
-    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst,              \
+    __ cvt##Inst(IceType_f##Size, XmmRegister::Encoded_Reg_##Dst, IceType_i32, \
                  dwordAddress(T1));                                            \
                                                                                \
     AssembledTest test = assemble();                                           \
@@ -1152,7 +1152,7 @@
                                                                                \
     __ mov(IceType_i32, GPRRegister::Encoded_Reg_##GPR,                        \
            Immediate(Inst##Size##DstValue));                                   \
-    __ cvt##Inst(IceType_f##Size, GPRRegister::Encoded_Reg_##GPR,              \
+    __ cvt##Inst(IceType_i32, GPRRegister::Encoded_Reg_##GPR, IceType_f##Size, \
                  dwordAddress(T0));                                            \
                                                                                \
     AssembledTest test = assemble();                                           \
diff --git a/unittest/AssemblerX8664/DataMov.cpp b/unittest/AssemblerX8664/DataMov.cpp
index 0610b45..6e83fce 100644
--- a/unittest/AssemblerX8664/DataMov.cpp
+++ b/unittest/AssemblerX8664/DataMov.cpp
@@ -263,6 +263,32 @@
 #undef TestRegAddr
 }
 
+TEST_F(AssemblerX8664Test, Movabs) {
+#define TestImplValue(Dst, Value)                                              \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Value ")";             \
+    uint64_t V = (Value);                                                      \
+    __ movabs(Encoded_GPR_##Dst##q(), V);                                      \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V, test.Dst()) << TestString;                                    \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst)                                                          \
+  do {                                                                         \
+    for (uint64_t Value : {0ull, 1ull, 0xFFFFFFull, 0x80000000ull,             \
+                           0xFFFFFFFFFFFFFFFFull}) {                           \
+      TestImplValue(Dst, Value);                                               \
+    }                                                                          \
+  } while (0)
+  // NOTE(review): TestImpl is never invoked, so this test asserts nothing.
+#undef TestImpl
+#undef TestImplValue
+}
+
 TEST_F(AssemblerX8664Test, Movzx) {
   static constexpr uint32_t Mask8 = 0x000000FF;
   static constexpr uint32_t Mask16 = 0x0000FFFF;
@@ -677,7 +703,7 @@
 }
 
 TEST_F(AssemblerX8664Test, MovdToXmm) {
-#define TestMovdXmmReg(Src, Dst, Value)                                        \
+#define TestMovdXmmReg32(Src, Dst, Value)                                      \
   do {                                                                         \
     assert(((Value)&0xFFFFFFFF) == (Value));                                   \
     static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
@@ -686,7 +712,7 @@
                                                                                \
     __ mov(IceType_i32, Encoded_GPR_##Src(), Immediate(Value));                \
     __ movss(IceType_f64, Encoded_Xmm_##Dst(), dwordAddress(T0));              \
-    __ movd(Encoded_Xmm_##Dst(), Encoded_GPR_##Src());                         \
+    __ movd(IceType_i32, Encoded_Xmm_##Dst(), Encoded_GPR_##Src());            \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
@@ -698,7 +724,35 @@
     reset();                                                                   \
   } while (0)
 
-#define TestMovdXmmAddr(Dst, Value)                                            \
+#define TestMovdXmmReg64(Src, Dst, Value)                                      \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = 0xFFFFFFFF00000000ull;                                 \
+    const uint64_t Expected = (static_cast<uint64_t>(Value) << 32) | (Value);  \
+                                                                               \
+    __ movabs(Encoded_GPR_##Src(), Expected);                                  \
+    __ movss(IceType_f64, Encoded_Xmm_##Dst(), dwordAddress(T0));              \
+    __ movd(IceType_i64, Encoded_Xmm_##Dst(), Encoded_GPR_##Src());            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<uint64_t>()) << TestString << " value is "    \
+                                              << Value;                        \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovdXmmReg(Src, Dst, Value)                                        \
+  do {                                                                         \
+    TestMovdXmmReg32(Src, Dst, Value);                                         \
+    TestMovdXmmReg64(Src, Dst, Value);                                         \
+  } while (0)
+
+#define TestMovdXmmAddr32(Dst, Value)                                          \
   do {                                                                         \
     assert(((Value)&0xFFFFFFFF) == (Value));                                   \
     static constexpr char TestString[] = "(" #Dst ", Addr)";                   \
@@ -708,7 +762,7 @@
     const uint64_t V1 = 0xFFFFFFFF00000000ull;                                 \
                                                                                \
     __ movss(IceType_f64, Encoded_Xmm_##Dst(), dwordAddress(T1));              \
-    __ movd(Encoded_Xmm_##Dst(), dwordAddress(T0));                            \
+    __ movd(IceType_i32, Encoded_Xmm_##Dst(), dwordAddress(T0));               \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
@@ -721,6 +775,35 @@
     reset();                                                                   \
   } while (0)
 
+#define TestMovdXmmAddr64(Dst, Value)                                          \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Dst ", Addr)";                   \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = (static_cast<uint64_t>(Value) << 32) | (Value);        \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V1 = 0xFFFFFFFF00000000ull;                                 \
+                                                                               \
+    __ movss(IceType_f64, Encoded_Xmm_##Dst(), dwordAddress(T1));              \
+    __ movd(IceType_i64, Encoded_Xmm_##Dst(), dwordAddress(T0));               \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setQwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.Dst<uint64_t>()) << TestString << " value is "          \
+                                        << Value;                              \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovdXmmAddr(Dst, Value)                                            \
+  do {                                                                         \
+    TestMovdXmmAddr32(Dst, Value);                                             \
+    TestMovdXmmAddr64(Dst, Value);                                             \
+  } while (0)
+
 #define TestMovd(Dst)                                                          \
   do {                                                                         \
     for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {   \
@@ -759,13 +842,17 @@
   TestMovd(xmm14);
   TestMovd(xmm15);
 
-#undef TestMovdXmmAddr
-#undef TestMovdXmmReg
 #undef TestMovd
+#undef TestMovdXmmAddr
+#undef TestMovdXmmAddr64
+#undef TestMovdXmmAddr32
+#undef TestMovdXmmReg
+#undef TestMovdXmmReg64
+#undef TestMovdXmmReg32
 }
 
 TEST_F(AssemblerX8664Test, MovdFromXmm) {
-#define TestMovdRegXmm(Src, Dst, Value)                                        \
+#define TestMovdRegXmm32(Src, Dst, Value)                                      \
   do {                                                                         \
     assert(((Value)&0xFFFFFFFF) == (Value));                                   \
     static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
@@ -773,7 +860,7 @@
     const uint32_t V0 = Value;                                                 \
                                                                                \
     __ movss(IceType_f64, Encoded_Xmm_##Src(), dwordAddress(T0));              \
-    __ movd(Encoded_GPR_##Dst(), Encoded_Xmm_##Src());                         \
+    __ movd(IceType_i32, Encoded_GPR_##Dst(), Encoded_Xmm_##Src());            \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
@@ -785,7 +872,33 @@
     reset();                                                                   \
   } while (0)
 
-#define TestMovdAddrXmm(Src, Value)                                            \
+#define TestMovdRegXmm64(Src, Dst, Value)                                      \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", " #Dst ")";               \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = (static_cast<uint64_t>(Value) << 32) | (Value);        \
+                                                                               \
+    __ movss(IceType_f64, Encoded_Xmm_##Src(), dwordAddress(T0));              \
+    __ movd(IceType_i64, Encoded_GPR_##Dst(), Encoded_Xmm_##Src());            \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T0, V0);                                                   \
+    test.run();                                                                \
+                                                                               \
+    /* Check the GPR written by movd, not the memory the test itself set. */   \
+    ASSERT_EQ(V0, test.Dst()) << TestString << " value is " << Value;          \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovdRegXmm(Src, Dst, Value)                                        \
+  do {                                                                         \
+    TestMovdRegXmm32(Src, Dst, Value);                                         \
+    TestMovdRegXmm64(Src, Dst, Value);                                         \
+  } while (0)
+
+#define TestMovdAddrXmm32(Src, Value)                                          \
   do {                                                                         \
     assert(((Value)&0xFFFFFFFF) == (Value));                                   \
     static constexpr char TestString[] = "(" #Src ", Addr)";                   \
@@ -795,7 +908,7 @@
     const uint32_t V1 = ~(Value);                                              \
                                                                                \
     __ movss(IceType_f64, Encoded_Xmm_##Src(), dwordAddress(T0));              \
-    __ movd(dwordAddress(T1), Encoded_Xmm_##Src());                            \
+    __ movd(IceType_i32, dwordAddress(T1), Encoded_Xmm_##Src());               \
                                                                                \
     AssembledTest test = assemble();                                           \
                                                                                \
@@ -808,6 +921,35 @@
     reset();                                                                   \
   } while (0)
 
+#define TestMovdAddrXmm64(Src, Value)                                          \
+  do {                                                                         \
+    assert(((Value)&0xFFFFFFFF) == (Value));                                   \
+    static constexpr char TestString[] = "(" #Src ", Addr)";                   \
+    const uint32_t T0 = allocateQword();                                       \
+    const uint64_t V0 = (static_cast<uint64_t>(Value) << 32) | (Value);        \
+    const uint32_t T1 = allocateQword();                                       \
+    const uint64_t V1 = ~V0;                                                   \
+                                                                               \
+    __ movss(IceType_f64, Encoded_Xmm_##Src(), dwordAddress(T0));              \
+    __ movd(IceType_i64, dwordAddress(T1), Encoded_Xmm_##Src());               \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+                                                                               \
+    test.setQwordTo(T0, V0);                                                   \
+    test.setQwordTo(T1, V1);                                                   \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(V0, test.contentsOfQword(T1)) << TestString << " value is "      \
+                                            << Value;                          \
+    reset();                                                                   \
+  } while (0)
+
+#define TestMovdAddrXmm(Src, Value)                                            \
+  do {                                                                         \
+    TestMovdAddrXmm32(Src, Value);                                             \
+    TestMovdAddrXmm64(Src, Value);                                             \
+  } while (0)
+
 #define TestMovd(Src)                                                          \
   do {                                                                         \
     for (uint32_t Value : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {   \
@@ -846,9 +988,13 @@
   TestMovd(xmm14);
   TestMovd(xmm15);
 
-#undef TestMovdAddrXmm
-#undef TestMovdRegXmm
 #undef TestMovd
+#undef TestMovdAddrXmm
+#undef TestMovdAddrXmm64
+#undef TestMovdAddrXmm32
+#undef TestMovdRegXmm
+#undef TestMovdRegXmm64
+#undef TestMovdRegXmm32
 }
 
 TEST_F(AssemblerX8664Test, MovqXmmAddr) {
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
index ac51c02..e43413a 100644
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -1104,15 +1104,16 @@
     reset();                                                                   \
   } while (0)
 
-#define TestImplSXmmReg(Dst, GPR, Inst, Size)                                  \
+#define TestImplSXmmReg(Dst, GPR, Inst, Size, IntType)                         \
   do {                                                                         \
     static constexpr char TestString[] =                                       \
-        "(" #Dst ", " #GPR ", cvt" #Inst ", f" #Size ")";                      \
+        "(" #Dst ", " #GPR ", cvt" #Inst ", " #IntType ", f" #Size ")";        \
     const uint32_t T0 = allocateDqword();                                      \
                                                                                \
     __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
     __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##SrcValue)); \
-    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), Encoded_GPR_##GPR());   \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), IntType,                \
+                 Encoded_GPR_##GPR());                                         \
                                                                                \
     AssembledTest test = assemble();                                           \
     test.setDqwordTo(T0, Inst##Size##DstValue);                                \
@@ -1122,21 +1123,23 @@
     reset();                                                                   \
   } while (0)
 
-#define TestImplSRegXmm(GPR, Src, Inst, Size)                                  \
+#define TestImplSRegXmm(GPR, Src, Inst, IntSize, Size)                         \
   do {                                                                         \
     static constexpr char TestString[] =                                       \
-        "(" #GPR ", " #Src ", cvt" #Inst ", f" #Size ")";                      \
+        "(" #GPR ", " #Src ", cvt" #Inst ", " #IntSize ", f" #Size ")";        \
     const uint32_t T0 = allocateDqword();                                      \
                                                                                \
     __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##DstValue)); \
     __ movups(Encoded_Xmm_##Src(), dwordAddress(T0));                          \
-    __ cvt##Inst(IceType_f##Size, Encoded_GPR_##GPR(), Encoded_Xmm_##Src());   \
+    __ cvt##Inst(IceType_i##IntSize, Encoded_GPR_##GPR(), IceType_f##Size,     \
+                 Encoded_Xmm_##Src());                                         \
                                                                                \
     AssembledTest test = assemble();                                           \
     test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
     test.run();                                                                \
                                                                                \
-    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+    ASSERT_EQ(static_cast<uint##IntSize##_t>(Inst##Size##Expected),            \
+              test.GPR())                                                      \
         << TestString;                                                         \
     reset();                                                                   \
   } while (0)
@@ -1160,15 +1163,16 @@
     reset();                                                                   \
   } while (0)
 
-#define TestImplSXmmAddr(Dst, Inst, Size)                                      \
+#define TestImplSXmmAddr(Dst, Inst, Size, IntType)                             \
   do {                                                                         \
     static constexpr char TestString[] =                                       \
-        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ")";                          \
+        "(" #Dst ", Addr, cvt" #Inst ", f" #Size ", " #IntType ")";            \
     const uint32_t T0 = allocateDqword();                                      \
     const uint32_t T1 = allocateDword();                                       \
                                                                                \
     __ movups(Encoded_Xmm_##Dst(), dwordAddress(T0));                          \
-    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), dwordAddress(T1));      \
+    __ cvt##Inst(IceType_f##Size, Encoded_Xmm_##Dst(), IntType,                \
+                 dwordAddress(T1));                                            \
                                                                                \
     AssembledTest test = assemble();                                           \
     test.setDqwordTo(T0, Inst##Size##DstValue);                                \
@@ -1179,20 +1183,22 @@
     reset();                                                                   \
   } while (0)
 
-#define TestImplSRegAddr(GPR, Inst, Size)                                      \
+#define TestImplSRegAddr(GPR, Inst, IntSize, Size)                             \
   do {                                                                         \
     static constexpr char TestString[] =                                       \
-        "(" #GPR ", Addr, cvt" #Inst ", f" #Size ")";                          \
+        "(" #GPR ", Addr, cvt" #Inst ", f" #Size ", " #IntSize ")";            \
     const uint32_t T0 = allocateDqword();                                      \
                                                                                \
     __ mov(IceType_i32, Encoded_GPR_##GPR(), Immediate(Inst##Size##DstValue)); \
-    __ cvt##Inst(IceType_f##Size, Encoded_GPR_##GPR(), dwordAddress(T0));      \
+    __ cvt##Inst(IceType_i##IntSize, Encoded_GPR_##GPR(), IceType_f##Size,     \
+                 dwordAddress(T0));                                            \
                                                                                \
     AssembledTest test = assemble();                                           \
     test.setDqwordTo(T0, Inst##Size##SrcValue);                                \
     test.run();                                                                \
                                                                                \
-    ASSERT_EQ(static_cast<uint32_t>(Inst##Size##Expected), test.GPR())         \
+    ASSERT_EQ(static_cast<uint##IntSize##_t>(Inst##Size##Expected),            \
+              test.GPR())                                                      \
         << TestString;                                                         \
     reset();                                                                   \
   } while (0)
@@ -1203,10 +1209,14 @@
     TestImplPXmmAddr(Src, dq2ps, Size);                                        \
     TestImplPXmmXmm(Dst, Src, tps2dq, Size);                                   \
     TestImplPXmmAddr(Src, tps2dq, Size);                                       \
-    TestImplSXmmReg(Dst, GPR, si2ss, Size);                                    \
-    TestImplSXmmAddr(Dst, si2ss, Size);                                        \
-    TestImplSRegXmm(GPR, Src, tss2si, Size);                                   \
-    TestImplSRegAddr(GPR, tss2si, Size);                                       \
+    TestImplSXmmReg(Dst, GPR, si2ss, Size, IceType_i32);                       \
+    TestImplSXmmReg(Dst, GPR, si2ss, Size, IceType_i64);                       \
+    TestImplSXmmAddr(Dst, si2ss, Size, IceType_i32);                           \
+    TestImplSXmmAddr(Dst, si2ss, Size, IceType_i64);                           \
+    TestImplSRegXmm(GPR, Src, tss2si, 32, Size);                               \
+    TestImplSRegXmm(GPR, Src, tss2si, 64, Size);                               \
+    TestImplSRegAddr(GPR, tss2si, 32, Size);                                   \
+    TestImplSRegAddr(GPR, tss2si, 64, Size);                                   \
     TestImplPXmmXmm(Dst, Src, float2float, Size);                              \
     TestImplPXmmAddr(Src, float2float, Size);                                  \
   } while (0)