Extracts a TargetX86Base target, which will be used as the common X86{32,64} implementation.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1202533003.
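
At the core of the refactor is a traits/CRTP pattern: the shared lowering
code moves into a templated X86Internal::TargetX86Base<Machine>, and each
concrete target supplies its machine-specific constants and tables through a
MachineTraits<Machine> specialization, as the X8632 changes below show. A
minimal standalone sketch of the pattern follows; TargetX86Base and
MachineTraits are the names from this patch, while alignedStackSlot() and the
simplified create() are illustrative stand-ins rather than Subzero's real
interfaces.

#include <cstdint>
#include <iostream>

class TargetX8632; // forward declaration of the concrete target

namespace X86Internal {

// The primary template is only declared; every machine must specialize it.
template <class Machine> struct MachineTraits;

// Shared logic lives here and reaches per-machine data via the traits.
template <class Machine> class TargetX86Base {
public:
  using Traits = MachineTraits<Machine>;

  static Machine *create() { return new Machine(); }

  // Round a byte count up to the machine's stack alignment.
  static uint32_t alignedStackSlot(uint32_t Bytes) {
    const uint32_t A = Traits::StackAlignmentBytes;
    return (Bytes + A - 1) & ~(A - 1);
  }
};

// The 32-bit target's traits supply its constants (cf. the real
// MachineTraits<TargetX8632> specialization in this patch).
template <> struct MachineTraits<TargetX8632> {
  static constexpr uint32_t StackAlignmentBytes = 16;
};

} // end of namespace X86Internal

class TargetX8632 : public X86Internal::TargetX86Base<TargetX8632> {};

int main() {
  std::cout << TargetX8632::alignedStackSlot(21) << "\n"; // prints 32
}

Keeping the dispatch in a traits class rather than in virtual functions lets
the compiler specialize the shared lowering per target, so the X8664
implementation can later plug in its own tables without touching the common
code.
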
diff --git a/Makefile.standalone b/Makefile.standalone
index 76b1d14..80794e4 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -199,8 +199,8 @@
 	IceRNG.cpp \
 	IceTargetLowering.cpp \
 	IceTargetLoweringARM32.cpp \
-	IceTargetLoweringX8632.cpp \
 	IceTargetLoweringMIPS32.cpp \
+	IceTargetLoweringX8632.cpp \
 	IceThreading.cpp \
 	IceTimerTree.cpp \
 	IceTranslator.cpp \
diff --git a/src/IceAssemblerX8664.h b/src/IceAssemblerX8664.h
new file mode 100644
index 0000000..3ed52a8
--- /dev/null
+++ b/src/IceAssemblerX8664.h
@@ -0,0 +1,63 @@
+//===- subzero/src/IceAssemblerX8664.h - Assembler for x86-64 ---*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Assembler class for x86-64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICEASSEMBLERX8664_H
+#define SUBZERO_SRC_ICEASSEMBLERX8664_H
+
+#include "IceAssembler.h"
+
+namespace Ice {
+namespace X8664 {
+
+class AssemblerX8664 final : public Assembler {
+  AssemblerX8664(const AssemblerX8664 &) = delete;
+  AssemblerX8664 &operator=(const AssemblerX8664 &) = delete;
+
+public:
+  explicit AssemblerX8664(bool use_far_branches = false) : Assembler() {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+
+  void alignFunction() override {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+
+  void padWithNop(intptr_t Padding) override {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+
+  SizeT getBundleAlignLog2Bytes() const override {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+
+  const char *getNonExecPadDirective() const override {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+
+  llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const override {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+
+  void bindCfgNodeLabel(SizeT NodeNumber) override {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+
+  bool fixupIsPCRel(FixupKind Kind) const override {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+};
+
+} // end of namespace X8664
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICEASSEMBLERX8664_H
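
One detail worth noting about the stub methods above: several return non-void
types (SizeT, bool, const char *, llvm::ArrayRef<uint8_t>) yet contain only a
call to llvm::report_fatal_error(). This compiles without "missing return"
warnings because report_fatal_error is marked noreturn, so control cannot
fall off the end of the function. A self-contained sketch of the same idiom,
with a hypothetical fatal() standing in for the LLVM call:

#include <cstdio>
#include <cstdlib>

// Stand-in for llvm::report_fatal_error, which is likewise noreturn.
[[noreturn]] void fatal(const char *Msg) {
  std::fprintf(stderr, "%s\n", Msg);
  std::abort();
}

struct Assembler {
  virtual ~Assembler() = default;
  virtual int getBundleAlignLog2Bytes() const = 0;
};

struct AssemblerStub final : Assembler {
  // No return statement needed: the call below never returns.
  int getBundleAlignLog2Bytes() const override {
    fatal("Not yet implemented");
  }
};
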
diff --git a/src/IceInst.h b/src/IceInst.h
index 2b5adc6..7aad054 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -16,6 +16,7 @@
 #ifndef SUBZERO_SRC_ICEINST_H
 #define SUBZERO_SRC_ICEINST_H
 
+#include "IceCfg.h"
 #include "IceDefs.h"
 #include "IceInst.def"
 #include "IceIntrinsics.h"
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 62c459b..7c93463 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -17,6 +17,7 @@
 
 #include "IceAssemblerARM32.h"
 #include "IceAssemblerX8632.h"
+#include "IceAssemblerX8664.h"
 #include "assembler_mips32.h"
 #include "IceCfg.h" // setError()
 #include "IceCfgNode.h"
@@ -27,6 +28,7 @@
 #include "IceTargetLoweringARM32.h"
 #include "IceTargetLoweringMIPS32.h"
 #include "IceTargetLoweringX8632.h"
+#include "IceTargetLoweringX8664.h"
 
 namespace Ice {
 
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index c1ba404..55a0bfc 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -2,9 +2,6 @@
 //
 //                        The Subzero Code Generator
 //
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
 //===----------------------------------------------------------------------===//
 //
 // This file implements the TargetLoweringX8632 class, which
@@ -13,133 +10,309 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/MathExtras.h"
-
-#include "IceCfg.h"
-#include "IceCfgNode.h"
-#include "IceClFlags.h"
-#include "IceDefs.h"
-#include "IceELFObjectWriter.h"
-#include "IceGlobalInits.h"
-#include "IceInstX8632.h"
-#include "IceLiveness.h"
-#include "IceOperand.h"
-#include "IceRegistersX8632.h"
-#include "IceTargetLoweringX8632.def"
 #include "IceTargetLoweringX8632.h"
-#include "IceUtils.h"
+
+#include "IceTargetLoweringX86Base.h"
 
 namespace Ice {
+namespace X86Internal {
+template <> struct MachineTraits<TargetX8632> {
+  using InstructionSet = TargetX8632::X86InstructionSet;
 
-namespace {
+  // The following table summarizes the logic for lowering the fcmp
+  // instruction.  There is one table entry for each of the 16 conditions.
+  //
+  // The first four columns describe the case when the operands are
+  // floating point scalar values.  A comment in lowerFcmp() describes the
+  // lowering template.  In the most general case, there is a compare
+  // followed by two conditional branches, because some fcmp conditions
+  // don't map to a single x86 conditional branch.  However, in many cases
+  // it is possible to swap the operands in the comparison and have a
+  // single conditional branch.  Since it's quite tedious to validate the
+  // table by hand, good execution tests are helpful.
+  //
+  // The last two columns describe the case when the operands are vectors
+  // of floating point values.  For most fcmp conditions, there is a clear
+  // mapping to a single x86 cmpps instruction variant.  Some fcmp
+  // conditions require special code to handle and these are marked in the
+  // table with a Cmpps_Invalid predicate.
+  static const struct TableFcmpType {
+    uint32_t Default;
+    bool SwapScalarOperands;
+    CondX86::BrCond C1, C2;
+    bool SwapVectorOperands;
+    CondX86::CmppsCond Predicate;
+  } TableFcmp[];
+  static const size_t TableFcmpSize;
 
-// The following table summarizes the logic for lowering the fcmp
-// instruction.  There is one table entry for each of the 16 conditions.
-//
-// The first four columns describe the case when the operands are
-// floating point scalar values.  A comment in lowerFcmp() describes the
-// lowering template.  In the most general case, there is a compare
-// followed by two conditional branches, because some fcmp conditions
-// don't map to a single x86 conditional branch.  However, in many cases
-// it is possible to swap the operands in the comparison and have a
-// single conditional branch.  Since it's quite tedious to validate the
-// table by hand, good execution tests are helpful.
-//
-// The last two columns describe the case when the operands are vectors
-// of floating point values.  For most fcmp conditions, there is a clear
-// mapping to a single x86 cmpps instruction variant.  Some fcmp
-// conditions require special code to handle and these are marked in the
-// table with a Cmpps_Invalid predicate.
-const struct TableFcmp_ {
-  uint32_t Default;
-  bool SwapScalarOperands;
-  CondX86::BrCond C1, C2;
-  bool SwapVectorOperands;
-  CondX86::CmppsCond Predicate;
-} TableFcmp[] = {
+  // The following table summarizes the logic for lowering the icmp instruction
+  // for i32 and narrower types.  Each icmp condition has a clear mapping to an
+  // x86 conditional branch instruction.
+
+  static const struct TableIcmp32Type {
+    CondX86::BrCond Mapping;
+  } TableIcmp32[];
+  static const size_t TableIcmp32Size;
+
+  // The following table summarizes the logic for lowering the icmp instruction
+  // for the i64 type.  For Eq and Ne, two separate 32-bit comparisons and
+  // conditional branches are needed.  For the other conditions, three separate
+  // conditional branches are needed.
+  static const struct TableIcmp64Type {
+    CondX86::BrCond C1, C2, C3;
+  } TableIcmp64[];
+  static const size_t TableIcmp64Size;
+
+  static CondX86::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
+    size_t Index = static_cast<size_t>(Cond);
+    assert(Index < TableIcmp32Size);
+    return TableIcmp32[Index].Mapping;
+  }
+
+  static const struct TableTypeX8632AttributesType {
+    Type InVectorElementType;
+  } TableTypeX8632Attributes[];
+  static const size_t TableTypeX8632AttributesSize;
+
+  // Return the type which the elements of the vector have in the X86
+  // representation of the vector.
+  static Type getInVectorElementType(Type Ty) {
+    assert(isVectorType(Ty));
+    size_t Index = static_cast<size_t>(Ty);
+    (void)Index;
+    assert(Index < TableTypeX8632AttributesSize);
+    return TableTypeX8632Attributes[Ty].InVectorElementType;
+  }
+
+  // The maximum number of arguments to pass in XMM registers
+  static constexpr uint32_t X86_MAX_XMM_ARGS = 4;
+  // The number of bits in a byte
+  static constexpr uint32_t X86_CHAR_BIT = 8;
+  // Stack alignment
+  static const uint32_t X86_STACK_ALIGNMENT_BYTES;
+  // Size of the return address on the stack
+  static constexpr uint32_t X86_RET_IP_SIZE_BYTES = 4;
+  // The number of different NOP instructions
+  static constexpr uint32_t X86_NUM_NOP_VARIANTS = 5;
+
+  // Value is in bytes. Return Value adjusted to the next highest multiple
+  // of the stack alignment.
+  static uint32_t applyStackAlignment(uint32_t Value) {
+    return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
+  }
+};
+
+const MachineTraits<TargetX8632>::TableFcmpType
+    MachineTraits<TargetX8632>::TableFcmp[] = {
 #define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
   { dflt, swapS, CondX86::C1, CondX86::C2, swapV, CondX86::pred }              \
   ,
-    FCMPX8632_TABLE
+        FCMPX8632_TABLE
 #undef X
 };
-const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
 
-// The following table summarizes the logic for lowering the icmp instruction
-// for i32 and narrower types.  Each icmp condition has a clear mapping to an
-// x86 conditional branch instruction.
+constexpr size_t MachineTraits<TargetX8632>::TableFcmpSize =
+    llvm::array_lengthof(TableFcmp);
 
-const struct TableIcmp32_ {
-  CondX86::BrCond Mapping;
-} TableIcmp32[] = {
+const MachineTraits<TargetX8632>::TableIcmp32Type
+    MachineTraits<TargetX8632>::TableIcmp32[] = {
 #define X(val, C_32, C1_64, C2_64, C3_64)                                      \
   { CondX86::C_32 }                                                            \
   ,
-    ICMPX8632_TABLE
+        ICMPX8632_TABLE
 #undef X
 };
-const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
 
-// The following table summarizes the logic for lowering the icmp instruction
-// for the i64 type.  For Eq and Ne, two separate 32-bit comparisons and
-// conditional branches are needed.  For the other conditions, three separate
-// conditional branches are needed.
-const struct TableIcmp64_ {
-  CondX86::BrCond C1, C2, C3;
-} TableIcmp64[] = {
+constexpr size_t MachineTraits<TargetX8632>::TableIcmp32Size =
+    llvm::array_lengthof(TableIcmp32);
+
+const MachineTraits<TargetX8632>::TableIcmp64Type
+    MachineTraits<TargetX8632>::TableIcmp64[] = {
 #define X(val, C_32, C1_64, C2_64, C3_64)                                      \
   { CondX86::C1_64, CondX86::C2_64, CondX86::C3_64 }                           \
   ,
-    ICMPX8632_TABLE
+        ICMPX8632_TABLE
 #undef X
 };
-const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
 
-CondX86::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
-  size_t Index = static_cast<size_t>(Cond);
-  assert(Index < TableIcmp32Size);
-  return TableIcmp32[Index].Mapping;
-}
+constexpr size_t MachineTraits<TargetX8632>::TableIcmp64Size =
+    llvm::array_lengthof(TableIcmp64);
 
-const struct TableTypeX8632Attributes_ {
-  Type InVectorElementType;
-} TableTypeX8632Attributes[] = {
+const MachineTraits<TargetX8632>::TableTypeX8632AttributesType
+    MachineTraits<TargetX8632>::TableTypeX8632Attributes[] = {
 #define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
   { elementty }                                                                \
   ,
-    ICETYPEX8632_TABLE
+        ICETYPEX8632_TABLE
 #undef X
 };
-const size_t TableTypeX8632AttributesSize =
+
+constexpr size_t MachineTraits<TargetX8632>::TableTypeX8632AttributesSize =
     llvm::array_lengthof(TableTypeX8632Attributes);
 
-// Return the type which the elements of the vector have in the X86
-// representation of the vector.
-Type getInVectorElementType(Type Ty) {
-  assert(isVectorType(Ty));
-  size_t Index = static_cast<size_t>(Ty);
-  (void)Index;
-  assert(Index < TableTypeX8632AttributesSize);
-  return TableTypeX8632Attributes[Ty].InVectorElementType;
+const uint32_t MachineTraits<TargetX8632>::X86_STACK_ALIGNMENT_BYTES = 16;
+} // end of namespace X86Internal
+
+TargetX8632 *TargetX8632::create(Cfg *Func) {
+  return X86Internal::TargetX86Base<TargetX8632>::create(Func);
 }
 
-// The maximum number of arguments to pass in XMM registers
-const uint32_t X86_MAX_XMM_ARGS = 4;
-// The number of bits in a byte
-const uint32_t X86_CHAR_BIT = 8;
-// Stack alignment
-const uint32_t X86_STACK_ALIGNMENT_BYTES = 16;
-// Size of the return address on the stack
-const uint32_t X86_RET_IP_SIZE_BYTES = 4;
-// The number of different NOP instructions
-const uint32_t X86_NUM_NOP_VARIANTS = 5;
+TargetDataX8632::TargetDataX8632(GlobalContext *Ctx)
+    : TargetDataLowering(Ctx) {}
 
-// Value is in bytes. Return Value adjusted to the next highest multiple
-// of the stack alignment.
-uint32_t applyStackAlignment(uint32_t Value) {
-  return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
+namespace {
+template <typename T> struct PoolTypeConverter {};
+
+template <> struct PoolTypeConverter<float> {
+  typedef uint32_t PrimitiveIntType;
+  typedef ConstantFloat IceType;
+  static const Type Ty = IceType_f32;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+const char *PoolTypeConverter<float>::TypeName = "float";
+const char *PoolTypeConverter<float>::AsmTag = ".long";
+const char *PoolTypeConverter<float>::PrintfString = "0x%x";
+
+template <> struct PoolTypeConverter<double> {
+  typedef uint64_t PrimitiveIntType;
+  typedef ConstantDouble IceType;
+  static const Type Ty = IceType_f64;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+const char *PoolTypeConverter<double>::TypeName = "double";
+const char *PoolTypeConverter<double>::AsmTag = ".quad";
+const char *PoolTypeConverter<double>::PrintfString = "0x%llx";
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint32_t> {
+  typedef uint32_t PrimitiveIntType;
+  typedef ConstantInteger32 IceType;
+  static const Type Ty = IceType_i32;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+const char *PoolTypeConverter<uint32_t>::TypeName = "i32";
+const char *PoolTypeConverter<uint32_t>::AsmTag = ".long";
+const char *PoolTypeConverter<uint32_t>::PrintfString = "0x%x";
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint16_t> {
+  typedef uint32_t PrimitiveIntType;
+  typedef ConstantInteger32 IceType;
+  static const Type Ty = IceType_i16;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+const char *PoolTypeConverter<uint16_t>::TypeName = "i16";
+const char *PoolTypeConverter<uint16_t>::AsmTag = ".short";
+const char *PoolTypeConverter<uint16_t>::PrintfString = "0x%x";
+
+// Add converter for int type constant pooling
+template <> struct PoolTypeConverter<uint8_t> {
+  typedef uint32_t PrimitiveIntType;
+  typedef ConstantInteger32 IceType;
+  static const Type Ty = IceType_i8;
+  static const char *TypeName;
+  static const char *AsmTag;
+  static const char *PrintfString;
+};
+const char *PoolTypeConverter<uint8_t>::TypeName = "i8";
+const char *PoolTypeConverter<uint8_t>::AsmTag = ".byte";
+const char *PoolTypeConverter<uint8_t>::PrintfString = "0x%x";
+} // end of anonymous namespace
+
+template <typename T>
+void TargetDataX8632::emitConstantPool(GlobalContext *Ctx) {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Type Ty = T::Ty;
+  SizeT Align = typeAlignInBytes(Ty);
+  ConstantList Pool = Ctx->getConstantPool(Ty);
+
+  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
+      << "\n";
+  Str << "\t.align\t" << Align << "\n";
+  for (Constant *C : Pool) {
+    if (!C->getShouldBePooled())
+      continue;
+    typename T::IceType *Const = llvm::cast<typename T::IceType>(C);
+    typename T::IceType::PrimType Value = Const->getValue();
+    // Use memcpy() to copy bits from Value into RawValue in a way
+    // that avoids breaking strict-aliasing rules.
+    typename T::PrimitiveIntType RawValue;
+    memcpy(&RawValue, &Value, sizeof(Value));
+    char buf[30];
+    int CharsPrinted =
+        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
+    assert(CharsPrinted >= 0 &&
+           (size_t)CharsPrinted < llvm::array_lengthof(buf));
+    (void)CharsPrinted; // avoid warnings if asserts are disabled
+    Const->emitPoolLabel(Str);
+    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t# " << T::TypeName << " "
+        << Value << "\n";
+  }
 }
 
+void TargetDataX8632::lowerConstants() {
+  if (Ctx->getFlags().getDisableTranslation())
+    return;
+  // No need to emit constants from the int pool since (for x86) they
+  // are embedded as immediates in the instructions, just emit float/double.
+  switch (Ctx->getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
+    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
+
+    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
+    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
+  } break;
+  case FT_Asm:
+  case FT_Iasm: {
+    OstreamLocker L(Ctx);
+
+    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
+    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
+    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
+
+    emitConstantPool<PoolTypeConverter<float>>(Ctx);
+    emitConstantPool<PoolTypeConverter<double>>(Ctx);
+  } break;
+  }
+}
+
+void TargetDataX8632::lowerGlobals(const VariableDeclarationList &Vars,
+                                   const IceString &SectionSuffix) {
+  switch (Ctx->getFlags().getOutFileType()) {
+  case FT_Elf: {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
+  } break;
+  case FT_Asm:
+  case FT_Iasm: {
+    const IceString &TranslateOnly = Ctx->getFlags().getTranslateOnly();
+    OstreamLocker L(Ctx);
+    for (const VariableDeclaration *Var : Vars) {
+      if (GlobalContext::matchSymbolName(Var->getName(), TranslateOnly)) {
+        emitGlobal(*Var, SectionSuffix);
+      }
+    }
+  } break;
+  }
+}
+
+TargetHeaderX8632::TargetHeaderX8632(GlobalContext *Ctx)
+    : TargetHeaderLowering(Ctx) {}
+
 // In some cases, there are x-macros tables for both high-level and
 // low-level instructions/operands that use the same enum key value.
 // The tables are kept separate to maintain a proper separation
@@ -148,6 +321,7 @@
 // added or deleted.  The following dummy namespaces use
 // static_asserts to ensure everything is kept in sync.
 
+namespace {
 // Validate the enum values in FCMPX8632_TABLE.
 namespace dummy1 {
 // Define a temporary set of enum values based on low-level table
@@ -245,5476 +419,6 @@
 ICETYPE_TABLE
 #undef X
 } // end of namespace dummy3
-
-// A helper class to ease the settings of RandomizationPoolingPause
-// to disable constant blinding or pooling for some translation phases.
-class BoolFlagSaver {
-  BoolFlagSaver() = delete;
-  BoolFlagSaver(const BoolFlagSaver &) = delete;
-  BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
-
-public:
-  BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
-  ~BoolFlagSaver() { Flag = OldValue; }
-
-private:
-  const bool OldValue;
-  bool &Flag;
-};
-
 } // end of anonymous namespace
 
-BoolFoldingEntry::BoolFoldingEntry(Inst *I)
-    : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)) {}
-
-BoolFolding::BoolFoldingProducerKind
-BoolFolding::getProducerKind(const Inst *Instr) {
-  if (llvm::isa<InstIcmp>(Instr)) {
-    if (Instr->getSrc(0)->getType() != IceType_i64)
-      return PK_Icmp32;
-    return PK_None; // TODO(stichnot): actually PK_Icmp64;
-  }
-  return PK_None; // TODO(stichnot): remove this
-
-  if (llvm::isa<InstFcmp>(Instr))
-    return PK_Fcmp;
-  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    switch (Cast->getCastKind()) {
-    default:
-      return PK_None;
-    case InstCast::Trunc:
-      return PK_Trunc;
-    }
-  }
-  return PK_None;
-}
-
-BoolFolding::BoolFoldingConsumerKind
-BoolFolding::getConsumerKind(const Inst *Instr) {
-  if (llvm::isa<InstBr>(Instr))
-    return CK_Br;
-  if (llvm::isa<InstSelect>(Instr))
-    return CK_Select;
-  return CK_None; // TODO(stichnot): remove this
-
-  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
-    switch (Cast->getCastKind()) {
-    default:
-      return CK_None;
-    case InstCast::Sext:
-      return CK_Sext;
-    case InstCast::Zext:
-      return CK_Zext;
-    }
-  }
-  return CK_None;
-}
-
-// Returns true if the producing instruction has a "complex" lowering
-// sequence.  This generally means that its lowering sequence requires
-// more than one conditional branch, namely 64-bit integer compares
-// and some floating-point compares.  When this is true, and there is
-// more than one consumer, we prefer to disable the folding
-// optimization because it minimizes branches.
-bool BoolFolding::hasComplexLowering(const Inst *Instr) {
-  switch (getProducerKind(Instr)) {
-  default:
-    return false;
-  case PK_Icmp64:
-    return true;
-  case PK_Fcmp:
-    return TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
-           CondX86::Br_None;
-  }
-}
-
-void BoolFolding::init(CfgNode *Node) {
-  Producers.clear();
-  for (Inst &Instr : Node->getInsts()) {
-    // Check whether Instr is a valid producer.
-    Variable *Var = Instr.getDest();
-    if (!Instr.isDeleted() // only consider non-deleted instructions
-        && Var             // only instructions with an actual dest var
-        && Var->getType() == IceType_i1          // only bool-type dest vars
-        && getProducerKind(&Instr) != PK_None) { // white-listed instructions
-      Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
-    }
-    // Check each src variable against the map.
-    for (SizeT I = 0; I < Instr.getSrcSize(); ++I) {
-      Operand *Src = Instr.getSrc(I);
-      SizeT NumVars = Src->getNumVars();
-      for (SizeT J = 0; J < NumVars; ++J) {
-        const Variable *Var = Src->getVar(J);
-        SizeT VarNum = Var->getIndex();
-        if (containsValid(VarNum)) {
-          if (I != 0 // All valid consumers use Var as the first source operand
-              || getConsumerKind(&Instr) == CK_None // must be white-listed
-              || (Producers[VarNum].IsComplex && // complex can't be multi-use
-                  Producers[VarNum].NumUses > 0)) {
-            setInvalid(VarNum);
-            continue;
-          }
-          ++Producers[VarNum].NumUses;
-          if (Instr.isLastUse(Var)) {
-            Producers[VarNum].IsLiveOut = false;
-          }
-        }
-      }
-    }
-  }
-  for (auto &I : Producers) {
-    // Ignore entries previously marked invalid.
-    if (I.second.Instr == nullptr)
-      continue;
-    // Disable the producer if its dest may be live beyond this block.
-    if (I.second.IsLiveOut) {
-      setInvalid(I.first);
-      continue;
-    }
-    // Mark as "dead" rather than outright deleting.  This is so that
-    // other peephole style optimizations during or before lowering
-    // have access to this instruction in undeleted form.  See for
-    // example tryOptimizedCmpxchgCmpBr().
-    I.second.Instr->setDead();
-  }
-}
-
-const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
-  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
-  if (Var == nullptr)
-    return nullptr;
-  SizeT VarNum = Var->getIndex();
-  auto Element = Producers.find(VarNum);
-  if (Element == Producers.end())
-    return nullptr;
-  return Element->second.Instr;
-}
-
-void BoolFolding::dump(const Cfg *Func) const {
-  if (!ALLOW_DUMP || !Func->isVerbose(IceV_Folding))
-    return;
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
-  for (auto &I : Producers) {
-    if (I.second.Instr == nullptr)
-      continue;
-    Str << "Found foldable producer:\n  ";
-    I.second.Instr->dump(Func);
-    Str << "\n";
-  }
-}
-
-void TargetX8632::initNodeForLowering(CfgNode *Node) {
-  FoldingInfo.init(Node);
-  FoldingInfo.dump(Func);
-}
-
-TargetX8632::TargetX8632(Cfg *Func) : TargetLowering(Func) {
-  static_assert((X86InstructionSet::End - X86InstructionSet::Begin) ==
-                    (TargetInstructionSet::X86InstructionSet_End -
-                     TargetInstructionSet::X86InstructionSet_Begin),
-                "X86InstructionSet range different from TargetInstructionSet");
-  if (Func->getContext()->getFlags().getTargetInstructionSet() !=
-      TargetInstructionSet::BaseInstructionSet) {
-    InstructionSet = static_cast<X86InstructionSet>(
-        (Func->getContext()->getFlags().getTargetInstructionSet() -
-         TargetInstructionSet::X86InstructionSet_Begin) +
-        X86InstructionSet::Begin);
-  }
-  // TODO: Don't initialize IntegerRegisters and friends every time.
-  // Instead, initialize in some sort of static initializer for the
-  // class.
-  llvm::SmallBitVector IntegerRegisters(RegX8632::Reg_NUM);
-  llvm::SmallBitVector IntegerRegistersI8(RegX8632::Reg_NUM);
-  llvm::SmallBitVector FloatRegisters(RegX8632::Reg_NUM);
-  llvm::SmallBitVector VectorRegisters(RegX8632::Reg_NUM);
-  llvm::SmallBitVector InvalidRegisters(RegX8632::Reg_NUM);
-  ScratchRegs.resize(RegX8632::Reg_NUM);
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  IntegerRegisters[RegX8632::val] = isInt;                                     \
-  IntegerRegistersI8[RegX8632::val] = isI8;                                    \
-  FloatRegisters[RegX8632::val] = isFP;                                        \
-  VectorRegisters[RegX8632::val] = isFP;                                       \
-  ScratchRegs[RegX8632::val] = scratch;
-  REGX8632_TABLE;
-#undef X
-  TypeToRegisterSet[IceType_void] = InvalidRegisters;
-  TypeToRegisterSet[IceType_i1] = IntegerRegistersI8;
-  TypeToRegisterSet[IceType_i8] = IntegerRegistersI8;
-  TypeToRegisterSet[IceType_i16] = IntegerRegisters;
-  TypeToRegisterSet[IceType_i32] = IntegerRegisters;
-  TypeToRegisterSet[IceType_i64] = IntegerRegisters;
-  TypeToRegisterSet[IceType_f32] = FloatRegisters;
-  TypeToRegisterSet[IceType_f64] = FloatRegisters;
-  TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
-  TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
-  TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
-  TypeToRegisterSet[IceType_v16i8] = VectorRegisters;
-  TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
-  TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
-  TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
-}
-
-void TargetX8632::translateO2() {
-  TimerMarker T(TimerStack::TT_O2, Func);
-
-  if (!Ctx->getFlags().getPhiEdgeSplit()) {
-    // Lower Phi instructions.
-    Func->placePhiLoads();
-    if (Func->hasError())
-      return;
-    Func->placePhiStores();
-    if (Func->hasError())
-      return;
-    Func->deletePhis();
-    if (Func->hasError())
-      return;
-    Func->dump("After Phi lowering");
-  }
-
-  // Address mode optimization.
-  Func->getVMetadata()->init(VMK_SingleDefs);
-  Func->doAddressOpt();
-
-  // Find read-modify-write opportunities.  Do this after address mode
-  // optimization so that doAddressOpt() doesn't need to be applied to RMW
-  // instructions as well.
-  findRMW();
-  Func->dump("After RMW transform");
-
-  // Argument lowering
-  Func->doArgLowering();
-
-  // Target lowering.  This requires liveness analysis for some parts
-  // of the lowering decisions, such as compare/branch fusing.  If
-  // non-lightweight liveness analysis is used, the instructions need
-  // to be renumbered first.  TODO: This renumbering should only be
-  // necessary if we're actually calculating live intervals, which we
-  // only do for register allocation.
-  Func->renumberInstructions();
-  if (Func->hasError())
-    return;
-
-  // TODO: It should be sufficient to use the fastest liveness
-  // calculation, i.e. livenessLightweight().  However, for some
-  // reason that slows down the rest of the translation.  Investigate.
-  Func->liveness(Liveness_Basic);
-  if (Func->hasError())
-    return;
-  Func->dump("After x86 address mode opt");
-
-  // Disable constant blinding or pooling for load optimization.
-  {
-    BoolFlagSaver B(RandomizationPoolingPaused, true);
-    doLoadOpt();
-  }
-  Func->genCode();
-  if (Func->hasError())
-    return;
-  Func->dump("After x86 codegen");
-
-  // Register allocation.  This requires instruction renumbering and
-  // full liveness analysis.
-  Func->renumberInstructions();
-  if (Func->hasError())
-    return;
-  Func->liveness(Liveness_Intervals);
-  if (Func->hasError())
-    return;
-  // Validate the live range computations.  The expensive validation
-  // call is deliberately only made when assertions are enabled.
-  assert(Func->validateLiveness());
-  // The post-codegen dump is done here, after liveness analysis and
-  // associated cleanup, to make the dump cleaner and more useful.
-  Func->dump("After initial x8632 codegen");
-  Func->getVMetadata()->init(VMK_All);
-  regAlloc(RAK_Global);
-  if (Func->hasError())
-    return;
-  Func->dump("After linear scan regalloc");
-
-  if (Ctx->getFlags().getPhiEdgeSplit()) {
-    // We need to pause constant blinding or pooling during advanced
-    // phi lowering, unless the lowering assignment has a physical
-    // register for the dest Variable.
-    {
-      BoolFlagSaver B(RandomizationPoolingPaused, true);
-      Func->advancedPhiLowering();
-    }
-    Func->dump("After advanced Phi lowering");
-  }
-
-  // Stack frame mapping.
-  Func->genFrame();
-  if (Func->hasError())
-    return;
-  Func->dump("After stack frame mapping");
-
-  Func->contractEmptyNodes();
-  Func->reorderNodes();
-
-  // Branch optimization.  This needs to be done just before code
-  // emission.  In particular, no transformations that insert or
-  // reorder CfgNodes should be done after branch optimization.  We go
-  // ahead and do it before nop insertion to reduce the amount of work
-  // needed for searching for opportunities.
-  Func->doBranchOpt();
-  Func->dump("After branch optimization");
-
-  // Nop insertion
-  if (Ctx->getFlags().shouldDoNopInsertion()) {
-    Func->doNopInsertion();
-  }
-}
-
-void TargetX8632::translateOm1() {
-  TimerMarker T(TimerStack::TT_Om1, Func);
-
-  Func->placePhiLoads();
-  if (Func->hasError())
-    return;
-  Func->placePhiStores();
-  if (Func->hasError())
-    return;
-  Func->deletePhis();
-  if (Func->hasError())
-    return;
-  Func->dump("After Phi lowering");
-
-  Func->doArgLowering();
-
-  Func->genCode();
-  if (Func->hasError())
-    return;
-  Func->dump("After initial x8632 codegen");
-
-  regAlloc(RAK_InfOnly);
-  if (Func->hasError())
-    return;
-  Func->dump("After regalloc of infinite-weight variables");
-
-  Func->genFrame();
-  if (Func->hasError())
-    return;
-  Func->dump("After stack frame mapping");
-
-  // Nop insertion
-  if (Ctx->getFlags().shouldDoNopInsertion()) {
-    Func->doNopInsertion();
-  }
-}
-
-namespace {
-
-bool canRMW(const InstArithmetic *Arith) {
-  Type Ty = Arith->getDest()->getType();
-  // X86 vector instructions write to a register and have no RMW
-  // option.
-  if (isVectorType(Ty))
-    return false;
-  bool isI64 = Ty == IceType_i64;
-
-  switch (Arith->getOp()) {
-  // Not handled for lack of simple lowering:
-  //   shift on i64
-  //   mul, udiv, urem, sdiv, srem, frem
-  // Not handled for lack of RMW instructions:
-  //   fadd, fsub, fmul, fdiv (also vector types)
-  default:
-    return false;
-  case InstArithmetic::Add:
-  case InstArithmetic::Sub:
-  case InstArithmetic::And:
-  case InstArithmetic::Or:
-  case InstArithmetic::Xor:
-    return true;
-  case InstArithmetic::Shl:
-  case InstArithmetic::Lshr:
-  case InstArithmetic::Ashr:
-    return false; // TODO(stichnot): implement
-    return !isI64;
-  }
-}
-
-bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
-  if (A == B)
-    return true;
-  if (auto *MemA = llvm::dyn_cast<OperandX8632Mem>(A)) {
-    if (auto *MemB = llvm::dyn_cast<OperandX8632Mem>(B)) {
-      return MemA->getBase() == MemB->getBase() &&
-             MemA->getOffset() == MemB->getOffset() &&
-             MemA->getIndex() == MemB->getIndex() &&
-             MemA->getShift() == MemB->getShift() &&
-             MemA->getSegmentRegister() == MemB->getSegmentRegister();
-    }
-  }
-  return false;
-}
-
-} // end of anonymous namespace
-
-void TargetX8632::findRMW() {
-  Func->dump("Before RMW");
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
-  for (CfgNode *Node : Func->getNodes()) {
-    // Walk through the instructions, considering each sequence of 3
-    // instructions, and look for the particular RMW pattern.  Note that this
-    // search can be "broken" (false negatives) if there are intervening deleted
-    // instructions, or intervening instructions that could be safely moved out
-    // of the way to reveal an RMW pattern.
-    auto E = Node->getInsts().end();
-    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
-    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
-      // Make I3 skip over deleted instructions.
-      while (I3 != E && I3->isDeleted())
-        ++I3;
-      if (I1 == E || I2 == E || I3 == E)
-        continue;
-      assert(!I1->isDeleted());
-      assert(!I2->isDeleted());
-      assert(!I3->isDeleted());
-      if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) {
-        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) {
-          if (auto *Store = llvm::dyn_cast<InstStore>(I3)) {
-            // Look for:
-            //   a = Load addr
-            //   b = <op> a, other
-            //   Store b, addr
-            // Change to:
-            //   a = Load addr
-            //   b = <op> a, other
-            //   x = FakeDef
-            //   RMW <op>, addr, other, x
-            //   b = Store b, addr, x
-            // Note that inferTwoAddress() makes sure setDestNonKillable() gets
-            // called on the updated Store instruction, to avoid liveness
-            // problems later.
-            //
-            // With this transformation, the Store instruction acquires a Dest
-            // variable and is now subject to dead code elimination if there are
-            // no more uses of "b".  Variable "x" is a beacon for determining
-            // whether the Store instruction gets dead-code eliminated.  If the
-            // Store instruction is eliminated, then it must be the case that
-            // the RMW instruction ends x's live range, and therefore the RMW
-            // instruction will be retained and later lowered.  On the other
-            // hand, if the RMW instruction does not end x's live range, then
-            // the Store instruction must still be present, and therefore the
-            // RMW instruction is ignored during lowering because it is
-            // redundant with the Store instruction.
-            //
-            // Note that if "a" has further uses, the RMW transformation may
-            // still trigger, resulting in two loads and one store, which is
-            // worse than the original one load and one store.  However, this is
-            // probably rare, and caching probably keeps it just as fast.
-            if (!isSameMemAddressOperand(Load->getSourceAddress(),
-                                         Store->getAddr()))
-              continue;
-            Operand *ArithSrcFromLoad = Arith->getSrc(0);
-            Operand *ArithSrcOther = Arith->getSrc(1);
-            if (ArithSrcFromLoad != Load->getDest()) {
-              if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
-                continue;
-              std::swap(ArithSrcFromLoad, ArithSrcOther);
-            }
-            if (Arith->getDest() != Store->getData())
-              continue;
-            if (!canRMW(Arith))
-              continue;
-            if (Func->isVerbose(IceV_RMW)) {
-              Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
-              Load->dump(Func);
-              Str << "\n  ";
-              Arith->dump(Func);
-              Str << "\n  ";
-              Store->dump(Func);
-              Str << "\n";
-            }
-            Variable *Beacon = Func->makeVariable(IceType_i32);
-            Beacon->setWeight(0);
-            Store->setRmwBeacon(Beacon);
-            InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
-            Node->getInsts().insert(I3, BeaconDef);
-            InstX8632FakeRMW *RMW = InstX8632FakeRMW::create(
-                Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp());
-            Node->getInsts().insert(I3, RMW);
-          }
-        }
-      }
-    }
-  }
-}
-
-namespace {
-
-// Converts a ConstantInteger32 operand into its constant value, or
-// MemoryOrderInvalid if the operand is not a ConstantInteger32.
-uint64_t getConstantMemoryOrder(Operand *Opnd) {
-  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return Integer->getValue();
-  return Intrinsics::MemoryOrderInvalid;
-}
-
-// Determines whether the dest of a Load instruction can be folded
-// into one of the src operands of a 2-operand instruction.  This is
-// true as long as the load dest matches exactly one of the binary
-// instruction's src operands.  Replaces Src0 or Src1 with LoadSrc if
-// the answer is true.
-bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
-                               Operand *&Src0, Operand *&Src1) {
-  if (Src0 == LoadDest && Src1 != LoadDest) {
-    Src0 = LoadSrc;
-    return true;
-  }
-  if (Src0 != LoadDest && Src1 == LoadDest) {
-    Src1 = LoadSrc;
-    return true;
-  }
-  return false;
-}
-
-} // end of anonymous namespace
-
-void TargetX8632::doLoadOpt() {
-  for (CfgNode *Node : Func->getNodes()) {
-    Context.init(Node);
-    while (!Context.atEnd()) {
-      Variable *LoadDest = nullptr;
-      Operand *LoadSrc = nullptr;
-      Inst *CurInst = Context.getCur();
-      Inst *Next = Context.getNextInst();
-      // Determine whether the current instruction is a Load
-      // instruction or equivalent.
-      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
-        // An InstLoad always qualifies.
-        LoadDest = Load->getDest();
-        const bool DoLegalize = false;
-        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
-                                    LoadDest->getType(), DoLegalize);
-      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
-        // An AtomicLoad intrinsic qualifies as long as it has a valid
-        // memory ordering, and can be implemented in a single
-        // instruction (i.e., not i64).
-        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
-        if (ID == Intrinsics::AtomicLoad &&
-            Intrin->getDest()->getType() != IceType_i64 &&
-            Intrinsics::isMemoryOrderValid(
-                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
-          LoadDest = Intrin->getDest();
-          const bool DoLegalize = false;
-          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
-                                      DoLegalize);
-        }
-      }
-      // A Load instruction can be folded into the following
-      // instruction only if the following instruction ends the Load's
-      // Dest variable's live range.
-      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
-        assert(LoadSrc);
-        Inst *NewInst = nullptr;
-        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
-          Operand *Src0 = Arith->getSrc(0);
-          Operand *Src1 = Arith->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstArithmetic::create(Func, Arith->getOp(),
-                                             Arith->getDest(), Src0, Src1);
-          }
-        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
-          Operand *Src0 = Icmp->getSrc(0);
-          Operand *Src1 = Icmp->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
-                                       Icmp->getDest(), Src0, Src1);
-          }
-        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
-          Operand *Src0 = Fcmp->getSrc(0);
-          Operand *Src1 = Fcmp->getSrc(1);
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
-                                       Fcmp->getDest(), Src0, Src1);
-          }
-        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
-          Operand *Src0 = Select->getTrueOperand();
-          Operand *Src1 = Select->getFalseOperand();
-          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
-            NewInst = InstSelect::create(Func, Select->getDest(),
-                                         Select->getCondition(), Src0, Src1);
-          }
-        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
-          // The load dest can always be folded into a Cast
-          // instruction.
-          Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
-          if (Src0 == LoadDest) {
-            NewInst = InstCast::create(Func, Cast->getCastKind(),
-                                       Cast->getDest(), LoadSrc);
-          }
-        }
-        if (NewInst) {
-          CurInst->setDeleted();
-          Next->setDeleted();
-          Context.insert(NewInst);
-          // Update NewInst->LiveRangesEnded so that target lowering
-          // may benefit.  Also update NewInst->HasSideEffects.
-          NewInst->spliceLivenessInfo(Next, CurInst);
-        }
-      }
-      Context.advanceCur();
-      Context.advanceNext();
-    }
-  }
-  Func->dump("After load optimization");
-}
-
-bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
-  if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
-    return Br->optimizeBranch(NextNode);
-  }
-  return false;
-}
-
-IceString TargetX8632::RegNames[] = {
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  name,
-    REGX8632_TABLE
-#undef X
-};
-
-Variable *TargetX8632::getPhysicalRegister(SizeT RegNum, Type Ty) {
-  if (Ty == IceType_void)
-    Ty = IceType_i32;
-  if (PhysicalRegisters[Ty].empty())
-    PhysicalRegisters[Ty].resize(RegX8632::Reg_NUM);
-  assert(RegNum < PhysicalRegisters[Ty].size());
-  Variable *Reg = PhysicalRegisters[Ty][RegNum];
-  if (Reg == nullptr) {
-    Reg = Func->makeVariable(Ty);
-    Reg->setRegNum(RegNum);
-    PhysicalRegisters[Ty][RegNum] = Reg;
-    // Specially mark esp as an "argument" so that it is considered
-    // live upon function entry.
-    if (RegNum == RegX8632::Reg_esp) {
-      Func->addImplicitArg(Reg);
-      Reg->setIgnoreLiveness();
-    }
-  }
-  return Reg;
-}
-
-IceString TargetX8632::getRegName(SizeT RegNum, Type Ty) const {
-  assert(RegNum < RegX8632::Reg_NUM);
-  static IceString RegNames8[] = {
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  name8,
-      REGX8632_TABLE
-#undef X
-  };
-  static IceString RegNames16[] = {
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  name16,
-      REGX8632_TABLE
-#undef X
-  };
-  switch (Ty) {
-  case IceType_i1:
-  case IceType_i8:
-    return RegNames8[RegNum];
-  case IceType_i16:
-    return RegNames16[RegNum];
-  default:
-    return RegNames[RegNum];
-  }
-}
-
-void TargetX8632::emitVariable(const Variable *Var) const {
-  Ostream &Str = Ctx->getStrEmit();
-  if (Var->hasReg()) {
-    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
-    return;
-  }
-  if (Var->getWeight().isInf()) {
-    llvm_unreachable("Infinite-weight Variable has no register assigned");
-  }
-  int32_t Offset = Var->getStackOffset();
-  if (!hasFramePointer())
-    Offset += getStackAdjustment();
-  if (Offset)
-    Str << Offset;
-  const Type FrameSPTy = IceType_i32;
-  Str << "(%" << getRegName(getFrameOrStackReg(), FrameSPTy) << ")";
-}
-
-X8632::Address TargetX8632::stackVarToAsmOperand(const Variable *Var) const {
-  if (Var->hasReg())
-    llvm_unreachable("Stack Variable has a register assigned");
-  if (Var->getWeight().isInf()) {
-    llvm_unreachable("Infinite-weight Variable has no register assigned");
-  }
-  int32_t Offset = Var->getStackOffset();
-  if (!hasFramePointer())
-    Offset += getStackAdjustment();
-  return X8632::Address(RegX8632::getEncodedGPR(getFrameOrStackReg()), Offset);
-}
-
-void TargetX8632::lowerArguments() {
-  VarList &Args = Func->getArgs();
-  // The first four arguments of vector type, regardless of their
-  // position relative to the other arguments in the argument list, are
-  // passed in registers xmm0 - xmm3.
-  unsigned NumXmmArgs = 0;
-
-  Context.init(Func->getEntryNode());
-  Context.setInsertPoint(Context.getCur());
-
-  for (SizeT I = 0, E = Args.size(); I < E && NumXmmArgs < X86_MAX_XMM_ARGS;
-       ++I) {
-    Variable *Arg = Args[I];
-    Type Ty = Arg->getType();
-    if (!isVectorType(Ty))
-      continue;
-    // Replace Arg in the argument list with the home register.  Then
-    // generate an instruction in the prolog to copy the home register
-    // to the assigned location of Arg.
-    int32_t RegNum = RegX8632::Reg_xmm0 + NumXmmArgs;
-    ++NumXmmArgs;
-    Variable *RegisterArg = Func->makeVariable(Ty);
-    if (ALLOW_DUMP)
-      RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
-    RegisterArg->setRegNum(RegNum);
-    RegisterArg->setIsArg();
-    Arg->setIsArg(false);
-
-    Args[I] = RegisterArg;
-    Context.insert(InstAssign::create(Func, Arg, RegisterArg));
-  }
-}
-
-// Helper function for addProlog().
-//
-// This assumes Arg is an argument passed on the stack.  This sets the
-// frame offset for Arg and updates InArgsSizeBytes according to Arg's
-// width.  For an I64 arg that has been split into Lo and Hi components,
-// it calls itself recursively on the components, taking care to handle
-// Lo first because of the little-endian architecture.  Lastly, this
-// function generates an instruction to copy Arg into its assigned
-// register if applicable.
-void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
-                                         size_t BasicFrameOffset,
-                                         size_t &InArgsSizeBytes) {
-  Variable *Lo = Arg->getLo();
-  Variable *Hi = Arg->getHi();
-  Type Ty = Arg->getType();
-  if (Lo && Hi && Ty == IceType_i64) {
-    assert(Lo->getType() != IceType_i64); // don't want infinite recursion
-    assert(Hi->getType() != IceType_i64); // don't want infinite recursion
-    finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
-    finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
-    return;
-  }
-  if (isVectorType(Ty)) {
-    InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
-  }
-  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
-  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
-  if (Arg->hasReg()) {
-    assert(Ty != IceType_i64);
-    OperandX8632Mem *Mem = OperandX8632Mem::create(
-        Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset()));
-    if (isVectorType(Arg->getType())) {
-      _movp(Arg, Mem);
-    } else {
-      _mov(Arg, Mem);
-    }
-    // This argument-copying instruction uses an explicit
-    // OperandX8632Mem operand instead of a Variable, so its
-    // fill-from-stack operation has to be tracked separately for
-    // statistics.
-    Ctx->statsUpdateFills();
-  }
-}
-
-Type TargetX8632::stackSlotType() { return IceType_i32; }
-
-void TargetX8632::addProlog(CfgNode *Node) {
-  // Stack frame layout:
-  //
-  // +------------------------+
-  // | 1. return address      |
-  // +------------------------+
-  // | 2. preserved registers |
-  // +------------------------+
-  // | 3. padding             |
-  // +------------------------+
-  // | 4. global spill area   |
-  // +------------------------+
-  // | 5. padding             |
-  // +------------------------+
-  // | 6. local spill area    |
-  // +------------------------+
-  // | 7. padding             |
-  // +------------------------+
-  // | 8. allocas             |
-  // +------------------------+
-  //
-  // The following variables record the size in bytes of the given areas:
-  //  * X86_RET_IP_SIZE_BYTES:  area 1
-  //  * PreservedRegsSizeBytes: area 2
-  //  * SpillAreaPaddingBytes:  area 3
-  //  * GlobalsSize:            area 4
-  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
-  //  * LocalsSpillAreaSize:    area 6
-  //  * SpillAreaSizeBytes:     areas 3 - 7
-
-  // Determine stack frame offsets for each Variable without a
-  // register assignment.  This can be done as one variable per stack
-  // slot.  Or, do coalescing by running the register allocator again
-  // with an infinite set of registers (as a side effect, this gives
-  // variables a second chance at physical register assignment).
-  //
-  // A middle ground approach is to leverage sparsity and allocate one
-  // block of space on the frame for globals (variables with
-  // multi-block lifetime), and one block to share for locals
-  // (single-block lifetime).
-
-  Context.init(Node);
-  Context.setInsertPoint(Context.getCur());
-
-  llvm::SmallBitVector CalleeSaves =
-      getRegisterSet(RegSet_CalleeSave, RegSet_None);
-  RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
-  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
-  size_t GlobalsSize = 0;
-  // If there is a separate locals area, this represents that area.
-  // Otherwise it counts any variable not counted by GlobalsSize.
-  SpillAreaSizeBytes = 0;
-  // If there is a separate locals area, this specifies the alignment
-  // for it.
-  uint32_t LocalsSlotsAlignmentBytes = 0;
-  // The entire spill locations area gets aligned to largest natural
-  // alignment of the variables that have a spill slot.
-  uint32_t SpillAreaAlignmentBytes = 0;
-  // A spill slot linked to a variable with a stack slot should reuse
-  // that stack slot.
-  std::function<bool(Variable *)> TargetVarHook =
-      [&VariablesLinkedToSpillSlots](Variable *Var) {
-        if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
-          assert(Var->getWeight().isZero());
-          if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
-            VariablesLinkedToSpillSlots.push_back(Var);
-            return true;
-          }
-        }
-        return false;
-      };
-
-  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
-  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
-                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
-                        &LocalsSlotsAlignmentBytes, TargetVarHook);
-  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
-  SpillAreaSizeBytes += GlobalsSize;
-
-  // Add push instructions for preserved registers.
-  uint32_t NumCallee = 0;
-  size_t PreservedRegsSizeBytes = 0;
-  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
-    if (CalleeSaves[i] && RegsUsed[i]) {
-      ++NumCallee;
-      PreservedRegsSizeBytes += 4;
-      _push(getPhysicalRegister(i));
-    }
-  }
-  Ctx->statsUpdateRegistersSaved(NumCallee);
-
-  // Generate "push ebp; mov ebp, esp"
-  if (IsEbpBasedFrame) {
-    assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
-               .count() == 0);
-    PreservedRegsSizeBytes += 4;
-    Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
-    Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
-    _push(ebp);
-    _mov(ebp, esp);
-    // Keep ebp live for late-stage liveness analysis
-    // (e.g. asm-verbose mode).
-    Context.insert(InstFakeUse::create(Func, ebp));
-  }
-
-  // Align the variables area. SpillAreaPaddingBytes is the size of
-  // the region after the preserved registers and before the spill areas.
-  // LocalsSlotsPaddingBytes is the amount of padding between the globals
-  // and locals area if they are separate.
-  assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
-  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
-  uint32_t SpillAreaPaddingBytes = 0;
-  uint32_t LocalsSlotsPaddingBytes = 0;
-  alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
-                       SpillAreaAlignmentBytes, GlobalsSize,
-                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
-                       &LocalsSlotsPaddingBytes);
-  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
-  uint32_t GlobalsAndSubsequentPaddingSize =
-      GlobalsSize + LocalsSlotsPaddingBytes;
-
-  // Align esp if necessary.
-  if (NeedsStackAlignment) {
-    uint32_t StackOffset = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
-    uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
-    SpillAreaSizeBytes = StackSize - StackOffset;
-  }
-
-  // Generate "sub esp, SpillAreaSizeBytes"
-  if (SpillAreaSizeBytes)
-    _sub(getPhysicalRegister(RegX8632::Reg_esp),
-         Ctx->getConstantInt32(SpillAreaSizeBytes));
-  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
-
-  resetStackAdjustment();
-
-  // Fill in stack offsets for stack args, and copy args into registers
-  // for those that were register-allocated.  Args are pushed right to
-  // left, so Arg[0] is closest to the stack/frame pointer.
-  Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
-  size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES;
-  if (!IsEbpBasedFrame)
-    BasicFrameOffset += SpillAreaSizeBytes;
-
-  const VarList &Args = Func->getArgs();
-  size_t InArgsSizeBytes = 0;
-  unsigned NumXmmArgs = 0;
-  for (Variable *Arg : Args) {
-    // Skip arguments passed in registers.
-    if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) {
-      ++NumXmmArgs;
-      continue;
-    }
-    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
-  }
-
-  // Fill in stack offsets for locals.
-  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
-                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
-                      IsEbpBasedFrame);
-  // Assign stack offsets to variables that have been linked to spilled
-  // variables.
-  for (Variable *Var : VariablesLinkedToSpillSlots) {
-    Variable *Linked = (llvm::cast<SpillVariable>(Var))->getLinkedTo();
-    Var->setStackOffset(Linked->getStackOffset());
-  }
-  this->HasComputedFrame = true;
-
-  if (ALLOW_DUMP && Func->isVerbose(IceV_Frame)) {
-    OstreamLocker L(Func->getContext());
-    Ostream &Str = Func->getContext()->getStrDump();
-
-    Str << "Stack layout:\n";
-    uint32_t EspAdjustmentPaddingSize =
-        SpillAreaSizeBytes - LocalsSpillAreaSize -
-        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
-    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
-        << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
-        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
-        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
-        << " globals spill area = " << GlobalsSize << " bytes\n"
-        << " globals-locals spill areas intermediate padding = "
-        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
-        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
-        << " esp alignment padding = " << EspAdjustmentPaddingSize
-        << " bytes\n";
-
-    Str << "Stack details:\n"
-        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
-        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
-        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
-        << " bytes\n"
-        << " is ebp based = " << IsEbpBasedFrame << "\n";
-  }
-}
-
-void TargetX8632::addEpilog(CfgNode *Node) {
-  InstList &Insts = Node->getInsts();
-  InstList::reverse_iterator RI, E;
-  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
-    if (llvm::isa<InstX8632Ret>(*RI))
-      break;
-  }
-  if (RI == E)
-    return;
-
-  // Convert the reverse_iterator position into its corresponding
-  // (forward) iterator position.
-  InstList::iterator InsertPoint = RI.base();
-  --InsertPoint;
-  Context.init(Node);
-  Context.setInsertPoint(InsertPoint);
-
-  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
-  if (IsEbpBasedFrame) {
-    Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
-    // For late-stage liveness analysis (e.g. asm-verbose mode),
-    // adding a fake use of esp before the assignment of esp=ebp keeps
-    // previous esp adjustments from being dead-code eliminated.
-    Context.insert(InstFakeUse::create(Func, esp));
-    _mov(esp, ebp);
-    _pop(ebp);
-  } else {
-    // add esp, SpillAreaSizeBytes
-    if (SpillAreaSizeBytes)
-      _add(esp, Ctx->getConstantInt32(SpillAreaSizeBytes));
-  }
-
-  // Add pop instructions for preserved registers.
-  llvm::SmallBitVector CalleeSaves =
-      getRegisterSet(RegSet_CalleeSave, RegSet_None);
-  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
-    SizeT j = CalleeSaves.size() - i - 1;
-    if (j == RegX8632::Reg_ebp && IsEbpBasedFrame)
-      continue;
-    if (CalleeSaves[j] && RegsUsed[j]) {
-      _pop(getPhysicalRegister(j));
-    }
-  }
-
-  if (!Ctx->getFlags().getUseSandboxing())
-    return;
-  // Change the original ret instruction into a sandboxed return sequence.
-  // t:ecx = pop
-  // bundle_lock
-  // and t, ~31
-  // jmp *t
-  // bundle_unlock
-  // FakeUse <original_ret_operand>
-  const SizeT BundleSize = 1
-                           << Func->getAssembler<>()->getBundleAlignLog2Bytes();
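-  // For example, with 32-byte bundles (getBundleAlignLog2Bytes() == 5),
-  // BundleSize is 32 and ~(BundleSize - 1) is the ~31 mask shown in the
-  // sketch above.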
-  Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
-  _pop(T_ecx);
-  _bundle_lock();
-  _and(T_ecx, Ctx->getConstantInt32(~(BundleSize - 1)));
-  _jmp(T_ecx);
-  _bundle_unlock();
-  if (RI->getSrcSize()) {
-    Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
-    Context.insert(InstFakeUse::create(Func, RetValue));
-  }
-  RI->setDeleted();
-}
-
-void TargetX8632::split64(Variable *Var) {
-  switch (Var->getType()) {
-  default:
-    return;
-  case IceType_i64:
-  // TODO: Only consider F64 if we need to push each half when
-  // passing as an argument to a function call.  Note that each half
-  // is still typed as I32.
-  case IceType_f64:
-    break;
-  }
-  Variable *Lo = Var->getLo();
-  Variable *Hi = Var->getHi();
-  if (Lo) {
-    assert(Hi);
-    return;
-  }
-  assert(Hi == nullptr);
-  Lo = Func->makeVariable(IceType_i32);
-  Hi = Func->makeVariable(IceType_i32);
-  if (ALLOW_DUMP) {
-    Lo->setName(Func, Var->getName(Func) + "__lo");
-    Hi->setName(Func, Var->getName(Func) + "__hi");
-  }
-  Var->setLoHi(Lo, Hi);
-  if (Var->getIsArg()) {
-    Lo->setIsArg();
-    Hi->setIsArg();
-  }
-}
-
-Operand *TargetX8632::loOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64 ||
-         Operand->getType() == IceType_f64);
-  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
-    return Operand;
-  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
-    split64(Var);
-    return Var->getLo();
-  }
-  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
-    ConstantInteger32 *ConstInt = llvm::dyn_cast<ConstantInteger32>(
-        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
-    return legalize(ConstInt);
-  }
-  if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
-    OperandX8632Mem *MemOperand = OperandX8632Mem::create(
-        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister());
-    // Test whether the offset should be randomized or pooled; if so, create
-    // the mem operand with the blinded/pooled constant. Otherwise, return
-    // it as an ordinary mem operand.
-    return legalize(MemOperand);
-  }
-  llvm_unreachable("Unsupported operand type");
-  return nullptr;
-}
-
-Operand *TargetX8632::hiOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64 ||
-         Operand->getType() == IceType_f64);
-  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
-    return Operand;
-  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
-    split64(Var);
-    return Var->getHi();
-  }
-  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
-    ConstantInteger32 *ConstInt = llvm::dyn_cast<ConstantInteger32>(
-        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
-    // Check whether the constant needs to be blinded or pooled.
-    return legalize(ConstInt);
-  }
-  if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
-    Constant *Offset = Mem->getOffset();
-    if (Offset == nullptr) {
-      Offset = Ctx->getConstantInt32(4);
-    } else if (ConstantInteger32 *IntOffset =
-                   llvm::dyn_cast<ConstantInteger32>(Offset)) {
-      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
-    } else if (ConstantRelocatable *SymOffset =
-                   llvm::dyn_cast<ConstantRelocatable>(Offset)) {
-      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
-      Offset =
-          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName(),
-                              SymOffset->getSuppressMangling());
-    }
-    OperandX8632Mem *MemOperand = OperandX8632Mem::create(
-        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister());
-    // Test whether the Offset is an eligible i32 constant for randomization
-    // and pooling; blind/pool it if so. Otherwise, return it as an ordinary
-    // mem operand.
-    return legalize(MemOperand);
-  }
-  llvm_unreachable("Unsupported operand type");
-  return nullptr;
-}
-
-llvm::SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include,
-                                                 RegSetMask Exclude) const {
-  llvm::SmallBitVector Registers(RegX8632::Reg_NUM);
-
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  if (scratch && (Include & RegSet_CallerSave))                                \
-    Registers[RegX8632::val] = true;                                           \
-  if (preserved && (Include & RegSet_CalleeSave))                              \
-    Registers[RegX8632::val] = true;                                           \
-  if (stackptr && (Include & RegSet_StackPointer))                             \
-    Registers[RegX8632::val] = true;                                           \
-  if (frameptr && (Include & RegSet_FramePointer))                             \
-    Registers[RegX8632::val] = true;                                           \
-  if (scratch && (Exclude & RegSet_CallerSave))                                \
-    Registers[RegX8632::val] = false;                                          \
-  if (preserved && (Exclude & RegSet_CalleeSave))                              \
-    Registers[RegX8632::val] = false;                                          \
-  if (stackptr && (Exclude & RegSet_StackPointer))                             \
-    Registers[RegX8632::val] = false;                                          \
-  if (frameptr && (Exclude & RegSet_FramePointer))                             \
-    Registers[RegX8632::val] = false;
-
-  REGX8632_TABLE
-
-#undef X
-
-  return Registers;
-}
-
-void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
-  IsEbpBasedFrame = true;
-  // Conservatively require the stack to be aligned.  Some stack
-  // adjustment operations implemented below assume that the stack is
-  // aligned before the alloca.  All the alloca code ensures that the
-  // stack alignment is preserved after the alloca.  The stack alignment
-  // restriction can be relaxed in some cases.
-  NeedsStackAlignment = true;
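-  // As a rough sketch (assuming 16-byte X86_STACK_ALIGNMENT_BYTES), an
-  // alloca of 20 bytes with align=32 would emit approximately:
-  //   and esp, -32   ; realign esp for the over-aligned request
-  //   sub esp, 32    ; 20 rounded up to the 32-byte alignment
-  //   mov dest, esp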
-
-  // TODO(stichnot): minimize the number of adjustments of esp, etc.
-  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
-  Operand *TotalSize = legalize(Inst->getSizeInBytes());
-  Variable *Dest = Inst->getDest();
-  uint32_t AlignmentParam = Inst->getAlignInBytes();
-  // For default align=0, set it to the real value 1, to avoid any
-  // bit-manipulation problems below.
-  AlignmentParam = std::max(AlignmentParam, 1u);
-
-  // LLVM enforces power of 2 alignment.
-  assert(llvm::isPowerOf2_32(AlignmentParam));
-  assert(llvm::isPowerOf2_32(X86_STACK_ALIGNMENT_BYTES));
-
-  uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
-  if (Alignment > X86_STACK_ALIGNMENT_BYTES) {
-    _and(esp, Ctx->getConstantInt32(-Alignment));
-  }
-  if (const auto *ConstantTotalSize =
-          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
-    uint32_t Value = ConstantTotalSize->getValue();
-    Value = Utils::applyAlignment(Value, Alignment);
-    _sub(esp, Ctx->getConstantInt32(Value));
-  } else {
-    // Non-constant sizes need to be adjusted to the next highest
-    // multiple of the required alignment at runtime.
-    Variable *T = makeReg(IceType_i32);
-    _mov(T, TotalSize);
-    _add(T, Ctx->getConstantInt32(Alignment - 1));
-    _and(T, Ctx->getConstantInt32(-Alignment));
-    _sub(esp, T);
-  }
-  _mov(Dest, esp);
-}
-
-// Strength-reduce scalar integer multiplication by a constant (for
-// i32 or narrower) for certain constants.  The lea instruction can be
-// used to multiply by 3, 5, or 9, and the shl instruction can be used
-// to multiply by powers of 2.  These can be combined such that
-// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5,
-// combined with left-shifting by 2.
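-// For example, multiplying by 100 = 5 * 5 * 4 can be emitted roughly as:
-//   lea t, [t+t*4]   ; t *= 5
-//   lea t, [t+t*4]   ; t *= 5
-//   shl t, 2         ; t *= 4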
-bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
-                                    int32_t Src1) {
-  // Disable this optimization for Om1 and O0, just to keep things
-  // simple there.
-  if (Ctx->getFlags().getOptLevel() < Opt_1)
-    return false;
-  Type Ty = Dest->getType();
-  Variable *T = nullptr;
-  if (Src1 == -1) {
-    _mov(T, Src0);
-    _neg(T);
-    _mov(Dest, T);
-    return true;
-  }
-  if (Src1 == 0) {
-    _mov(Dest, Ctx->getConstantZero(Ty));
-    return true;
-  }
-  if (Src1 == 1) {
-    _mov(T, Src0);
-    _mov(Dest, T);
-    return true;
-  }
-  // Don't bother with the edge case where Src1 == MININT.
-  if (Src1 == -Src1)
-    return false;
-  const bool Src1IsNegative = Src1 < 0;
-  if (Src1IsNegative)
-    Src1 = -Src1;
-  uint32_t Count9 = 0;
-  uint32_t Count5 = 0;
-  uint32_t Count3 = 0;
-  uint32_t Count2 = 0;
-  uint32_t CountOps = 0;
-  while (Src1 > 1) {
-    if (Src1 % 9 == 0) {
-      ++CountOps;
-      ++Count9;
-      Src1 /= 9;
-    } else if (Src1 % 5 == 0) {
-      ++CountOps;
-      ++Count5;
-      Src1 /= 5;
-    } else if (Src1 % 3 == 0) {
-      ++CountOps;
-      ++Count3;
-      Src1 /= 3;
-    } else if (Src1 % 2 == 0) {
-      if (Count2 == 0)
-        ++CountOps;
-      ++Count2;
-      Src1 /= 2;
-    } else {
-      return false;
-    }
-  }
-  // Lea optimization only works for i16 and i32 types, not i8.
-  if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9))
-    return false;
-  // Limit the number of lea/shl operations for a single multiply, to
-  // a somewhat arbitrary choice of 3.
-  const uint32_t MaxOpsForOptimizedMul = 3;
-  if (CountOps > MaxOpsForOptimizedMul)
-    return false;
-  _mov(T, Src0);
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  for (uint32_t i = 0; i < Count9; ++i) {
-    const uint16_t Shift = 3; // log2(9-1)
-    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
-    _set_dest_nonkillable();
-  }
-  for (uint32_t i = 0; i < Count5; ++i) {
-    const uint16_t Shift = 2; // log2(5-1)
-    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
-    _set_dest_nonkillable();
-  }
-  for (uint32_t i = 0; i < Count3; ++i) {
-    const uint16_t Shift = 1; // log2(3-1)
-    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
-    _set_dest_nonkillable();
-  }
-  if (Count2) {
-    _shl(T, Ctx->getConstantInt(Ty, Count2));
-  }
-  if (Src1IsNegative)
-    _neg(T);
-  _mov(Dest, T);
-  return true;
-}
-
-void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
-  Variable *Dest = Inst->getDest();
-  Operand *Src0 = legalize(Inst->getSrc(0));
-  Operand *Src1 = legalize(Inst->getSrc(1));
-  if (Inst->isCommutative()) {
-    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
-      std::swap(Src0, Src1);
-    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
-      std::swap(Src0, Src1);
-  }
-  if (Dest->getType() == IceType_i64) {
-    // These helper-call-involved instructions are lowered in this
-    // separate switch because loOperand() and hiOperand() may insert
-    // redundant instructions for constant blinding and pooling, and
-    // such redundant instructions fail liveness analysis under the
-    // -Om1 setting. Moreover, these arguments do not need to be
-    // processed by loOperand() and hiOperand() in order to be used.
-    switch (Inst->getOp()) {
-    case InstArithmetic::Udiv: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs);
-      Call->addArg(Inst->getSrc(0));
-      Call->addArg(Inst->getSrc(1));
-      lowerCall(Call);
-      return;
-    }
-    case InstArithmetic::Sdiv: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(H_sdiv_i64, Dest, MaxSrcs);
-      Call->addArg(Inst->getSrc(0));
-      Call->addArg(Inst->getSrc(1));
-      lowerCall(Call);
-      return;
-    }
-    case InstArithmetic::Urem: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(H_urem_i64, Dest, MaxSrcs);
-      Call->addArg(Inst->getSrc(0));
-      Call->addArg(Inst->getSrc(1));
-      lowerCall(Call);
-      return;
-    }
-    case InstArithmetic::Srem: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(H_srem_i64, Dest, MaxSrcs);
-      Call->addArg(Inst->getSrc(0));
-      Call->addArg(Inst->getSrc(1));
-      lowerCall(Call);
-      return;
-    }
-    default:
-      break;
-    }
-
-    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Operand *Src0Lo = loOperand(Src0);
-    Operand *Src0Hi = hiOperand(Src0);
-    Operand *Src1Lo = loOperand(Src1);
-    Operand *Src1Hi = hiOperand(Src1);
-    Variable *T_Lo = nullptr, *T_Hi = nullptr;
-    switch (Inst->getOp()) {
-    case InstArithmetic::_num:
-      llvm_unreachable("Unknown arithmetic operator");
-      break;
-    case InstArithmetic::Add:
-      _mov(T_Lo, Src0Lo);
-      _add(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _adc(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::And:
-      _mov(T_Lo, Src0Lo);
-      _and(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _and(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Or:
-      _mov(T_Lo, Src0Lo);
-      _or(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _or(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Xor:
-      _mov(T_Lo, Src0Lo);
-      _xor(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _xor(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Sub:
-      _mov(T_Lo, Src0Lo);
-      _sub(T_Lo, Src1Lo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, Src0Hi);
-      _sbb(T_Hi, Src1Hi);
-      _mov(DestHi, T_Hi);
-      break;
-    case InstArithmetic::Mul: {
-      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-      Variable *T_4Lo = makeReg(IceType_i32, RegX8632::Reg_eax);
-      Variable *T_4Hi = makeReg(IceType_i32, RegX8632::Reg_edx);
-      // gcc does the following:
-      // a=b*c ==>
-      //   t1 = b.hi; t1 *=(imul) c.lo
-      //   t2 = c.hi; t2 *=(imul) b.lo
-      //   t3:eax = b.lo
-      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
-      //   a.lo = t4.lo
-      //   t4.hi += t1
-      //   t4.hi += t2
-      //   a.hi = t4.hi
-      // The mul instruction cannot take an immediate operand.
-      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
-      _mov(T_1, Src0Hi);
-      _imul(T_1, Src1Lo);
-      _mov(T_2, Src1Hi);
-      _imul(T_2, Src0Lo);
-      _mov(T_3, Src0Lo, RegX8632::Reg_eax);
-      _mul(T_4Lo, T_3, Src1Lo);
-      // The mul instruction produces two dest variables, edx:eax.  We
-      // create a fake definition of edx to account for this.
-      Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo));
-      _mov(DestLo, T_4Lo);
-      _add(T_4Hi, T_1);
-      _add(T_4Hi, T_2);
-      _mov(DestHi, T_4Hi);
-    } break;
-    case InstArithmetic::Shl: {
-      // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
-      // gcc does the following:
-      // a=b<<c ==>
-      //   t1:ecx = c.lo & 0xff
-      //   t2 = b.lo
-      //   t3 = b.hi
-      //   t3 = shld t3, t2, t1
-      //   t2 = shl t2, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t3)
-      //   t3 = t2
-      //   t2 = 0
-      // L1:
-      //   a.lo = t2
-      //   a.hi = t3
-      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-      Constant *BitTest = Ctx->getConstantInt32(0x20);
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      InstX8632Label *Label = InstX8632Label::create(Func, this);
-      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
-      _mov(T_2, Src0Lo);
-      _mov(T_3, Src0Hi);
-      _shld(T_3, T_2, T_1);
-      _shl(T_2, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the
-      // intra-block control flow, so we need the _mov_nonkillable
-      // variant to avoid liveness problems.
-      _mov_nonkillable(T_3, T_2);
-      _mov_nonkillable(T_2, Zero);
-      Context.insert(Label);
-      _mov(DestLo, T_2);
-      _mov(DestHi, T_3);
-    } break;
-    case InstArithmetic::Lshr: {
-      // a=b>>c (unsigned) ==>
-      //   t1:ecx = c.lo & 0xff
-      //   t2 = b.lo
-      //   t3 = b.hi
-      //   t2 = shrd t2, t3, t1
-      //   t3 = shr t3, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t2)
-      //   t2 = t3
-      //   t3 = 0
-      // L1:
-      //   a.lo = t2
-      //   a.hi = t3
-      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-      Constant *BitTest = Ctx->getConstantInt32(0x20);
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      InstX8632Label *Label = InstX8632Label::create(Func, this);
-      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
-      _mov(T_2, Src0Lo);
-      _mov(T_3, Src0Hi);
-      _shrd(T_2, T_3, T_1);
-      _shr(T_3, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the
-      // intra-block control flow, so we need the _mov_nonkillable
-      // variant to avoid liveness problems.
-      _mov_nonkillable(T_2, T_3);
-      _mov_nonkillable(T_3, Zero);
-      Context.insert(Label);
-      _mov(DestLo, T_2);
-      _mov(DestHi, T_3);
-    } break;
-    case InstArithmetic::Ashr: {
-      // a=b>>c (signed) ==>
-      //   t1:ecx = c.lo & 0xff
-      //   t2 = b.lo
-      //   t3 = b.hi
-      //   t2 = shrd t2, t3, t1
-      //   t3 = sar t3, t1
-      //   test t1, 0x20
-      //   je L1
-      //   use(t2)
-      //   t2 = t3
-      //   t3 = sar t3, 0x1f
-      // L1:
-      //   a.lo = t2
-      //   a.hi = t3
-      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
-      Constant *BitTest = Ctx->getConstantInt32(0x20);
-      Constant *SignExtend = Ctx->getConstantInt32(0x1f);
-      InstX8632Label *Label = InstX8632Label::create(Func, this);
-      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
-      _mov(T_2, Src0Lo);
-      _mov(T_3, Src0Hi);
-      _shrd(T_2, T_3, T_1);
-      _sar(T_3, T_1);
-      _test(T_1, BitTest);
-      _br(CondX86::Br_e, Label);
-      // T_2 and T_3 are being assigned again because of the
-      // intra-block control flow, so T_2 needs the _mov_nonkillable
-      // variant to avoid liveness problems.  T_3 doesn't need special
-      // treatment because it is reassigned via _sar instead of _mov.
-      _mov_nonkillable(T_2, T_3);
-      _sar(T_3, SignExtend);
-      Context.insert(Label);
-      _mov(DestLo, T_2);
-      _mov(DestHi, T_3);
-    } break;
-    case InstArithmetic::Fadd:
-    case InstArithmetic::Fsub:
-    case InstArithmetic::Fmul:
-    case InstArithmetic::Fdiv:
-    case InstArithmetic::Frem:
-      llvm_unreachable("FP instruction with i64 type");
-      break;
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Srem:
-      llvm_unreachable("Call-helper-involved instruction for i64 type \
-                       should have already been handled before");
-      break;
-    }
-    return;
-  }
-  if (isVectorType(Dest->getType())) {
-    // TODO: Trap on integer divide and integer modulo by zero.
-    // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
-    if (llvm::isa<OperandX8632Mem>(Src1))
-      Src1 = legalizeToVar(Src1);
-    switch (Inst->getOp()) {
-    case InstArithmetic::_num:
-      llvm_unreachable("Unknown arithmetic operator");
-      break;
-    case InstArithmetic::Add: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _padd(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::And: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _pand(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Or: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _por(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Xor: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _pxor(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Sub: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _psub(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Mul: {
-      bool TypesAreValidForPmull =
-          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
-      bool InstructionSetIsValidForPmull =
-          Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;
-      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
-        Variable *T = makeReg(Dest->getType());
-        _movp(T, Src0);
-        _pmull(T, Src1);
-        _movp(Dest, T);
-      } else if (Dest->getType() == IceType_v4i32) {
-        // Lowering sequence:
-        // Note: The mask arguments have index 0 on the left.
-        //
-        // movups  T1, Src0
-        // pshufd  T2, Src0, {1,0,3,0}
-        // pshufd  T3, Src1, {1,0,3,0}
-        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
-        // pmuludq T1, Src1
-        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
-        // pmuludq T2, T3
-        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
-        // shufps  T1, T2, {0,2,0,2}
-        // pshufd  T4, T1, {0,2,1,3}
-        // movups  Dest, T4
-
-        // Mask that directs pshufd to create a vector with entries
-        // Src[1, 0, 3, 0]
-        const unsigned Constant1030 = 0x31;
-        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
-        // Mask that directs shufps to create a vector with entries
-        // Dest[0, 2], Src[0, 2]
-        const unsigned Mask0202 = 0x88;
-        // Mask that directs pshufd to create a vector with entries
-        // Src[0, 2, 1, 3]
-        const unsigned Mask0213 = 0xd8;
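-        // (In each mask, consecutive 2-bit fields, starting from the low
-        // bits, select the source element for each dest lane; e.g.
-        // 0x31 == 0b00110001 selects elements 1, 0, 3, 0.)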
-        Variable *T1 = makeReg(IceType_v4i32);
-        Variable *T2 = makeReg(IceType_v4i32);
-        Variable *T3 = makeReg(IceType_v4i32);
-        Variable *T4 = makeReg(IceType_v4i32);
-        _movp(T1, Src0);
-        _pshufd(T2, Src0, Mask1030);
-        _pshufd(T3, Src1, Mask1030);
-        _pmuludq(T1, Src1);
-        _pmuludq(T2, T3);
-        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
-        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
-        _movp(Dest, T4);
-      } else {
-        assert(Dest->getType() == IceType_v16i8);
-        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
-      }
-    } break;
-    case InstArithmetic::Shl:
-    case InstArithmetic::Lshr:
-    case InstArithmetic::Ashr:
-    case InstArithmetic::Udiv:
-    case InstArithmetic::Urem:
-    case InstArithmetic::Sdiv:
-    case InstArithmetic::Srem:
-      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
-      break;
-    case InstArithmetic::Fadd: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _addps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fsub: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _subps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fmul: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _mulps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Fdiv: {
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0);
-      _divps(T, Src1);
-      _movp(Dest, T);
-    } break;
-    case InstArithmetic::Frem:
-      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
-      break;
-    }
-    return;
-  }
-  Variable *T_edx = nullptr;
-  Variable *T = nullptr;
-  switch (Inst->getOp()) {
-  case InstArithmetic::_num:
-    llvm_unreachable("Unknown arithmetic operator");
-    break;
-  case InstArithmetic::Add:
-    _mov(T, Src0);
-    _add(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::And:
-    _mov(T, Src0);
-    _and(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Or:
-    _mov(T, Src0);
-    _or(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Xor:
-    _mov(T, Src0);
-    _xor(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Sub:
-    _mov(T, Src0);
-    _sub(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Mul:
-    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-      if (optimizeScalarMul(Dest, Src0, C->getValue()))
-        return;
-    }
-    // The 8-bit version of imul only allows the one-operand form
-    // "imul r/m8", which implicitly multiplies by al, so T must be
-    // in eax.
-    if (isByteSizedArithType(Dest->getType())) {
-      _mov(T, Src0, RegX8632::Reg_eax);
-      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    } else {
-      _mov(T, Src0);
-    }
-    _imul(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Shl:
-    _mov(T, Src0);
-    if (!llvm::isa<Constant>(Src1))
-      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
-    _shl(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Lshr:
-    _mov(T, Src0);
-    if (!llvm::isa<Constant>(Src1))
-      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
-    _shr(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Ashr:
-    _mov(T, Src0);
-    if (!llvm::isa<Constant>(Src1))
-      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
-    _sar(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Udiv:
-    // div and idiv are the few arithmetic operators that do not allow
-    // immediates as the operand.
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_ah = nullptr;
-      Constant *Zero = Ctx->getConstantZero(IceType_i8);
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _mov(T_ah, Zero, RegX8632::Reg_ah);
-      _div(T, Src1, T_ah);
-      _mov(Dest, T);
-    } else {
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _mov(T_edx, Zero, RegX8632::Reg_edx);
-      _div(T, Src1, T_edx);
-      _mov(Dest, T);
-    }
-    break;
-  case InstArithmetic::Sdiv:
-    // TODO(stichnot): Enable this after doing more thorough performance
-    // and cross testing.
-    if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
-      // Optimize division by constant power of 2, but not for Om1
-      // or O0, just to keep things simple there.
-      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-        int32_t Divisor = C->getValue();
-        uint32_t UDivisor = static_cast<uint32_t>(Divisor);
-        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
-          uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          Type Ty = Dest->getType();
-          // LLVM does the following for dest=src/(1<<log):
-          //   t=src
-          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
-          //   shr t,typewidth-log
-          //   add t,src
-          //   sar t,log
-          //   dest=t
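-          // E.g., for i32 src=-5, log=3: t=-5; sar 31 -> -1; shr 29 -> 7;
-          // add src -> 2; sar 3 -> 0, matching C's truncation -5/8 == 0.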
-          uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
-          _mov(T, Src0);
-          // If for some reason we are dividing by 1, just treat it
-          // like an assignment.
-          if (LogDiv > 0) {
-            // The initial sar is unnecessary when dividing by 2.
-            if (LogDiv > 1)
-              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
-            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
-            _add(T, Src0);
-            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
-          }
-          _mov(Dest, T);
-          return;
-        }
-      }
-    }
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _cbwdq(T, T);
-      _idiv(T, Src1, T);
-      _mov(Dest, T);
-    } else {
-      T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _cbwdq(T_edx, T);
-      _idiv(T, Src1, T_edx);
-      _mov(Dest, T);
-    }
-    break;
-  case InstArithmetic::Urem:
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_ah = nullptr;
-      Constant *Zero = Ctx->getConstantZero(IceType_i8);
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _mov(T_ah, Zero, RegX8632::Reg_ah);
-      _div(T_ah, Src1, T);
-      _mov(Dest, T_ah);
-    } else {
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(T_edx, Zero, RegX8632::Reg_edx);
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _div(T_edx, Src1, T);
-      _mov(Dest, T_edx);
-    }
-    break;
-  case InstArithmetic::Srem:
-    // TODO(stichnot): Enable this after doing more thorough performance
-    // and cross testing.
-    if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
-      // Optimize mod by constant power of 2, but not for Om1 or O0,
-      // just to keep things simple there.
-      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
-        int32_t Divisor = C->getValue();
-        uint32_t UDivisor = static_cast<uint32_t>(Divisor);
-        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
-          uint32_t LogDiv = llvm::Log2_32(UDivisor);
-          Type Ty = Dest->getType();
-          // LLVM does the following for dest=src%(1<<log):
-          //   t=src
-          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
-          //   shr t,typewidth-log
-          //   add t,src
-          //   and t, -(1<<log)
-          //   sub t,src
-          //   neg t
-          //   dest=t
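-          // E.g., for i32 src=-5, log=3: t becomes 2 as in the sdiv case
-          // above; then and -8 -> 0; sub src -> 5; neg -> -5 == -5 % 8.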
-          uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
-          // If for some reason we are dividing by 1, just assign 0.
-          if (LogDiv == 0) {
-            _mov(Dest, Ctx->getConstantZero(Ty));
-            return;
-          }
-          _mov(T, Src0);
-          // The initial sar is unnecessary when dividing by 2.
-          if (LogDiv > 1)
-            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
-          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
-          _add(T, Src0);
-          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
-          _sub(T, Src0);
-          _neg(T);
-          _mov(Dest, T);
-          return;
-        }
-      }
-    }
-    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-    if (isByteSizedArithType(Dest->getType())) {
-      Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _cbwdq(T, T);
-      Context.insert(InstFakeDef::create(Func, T_ah));
-      _idiv(T_ah, Src1, T);
-      _mov(Dest, T_ah);
-    } else {
-      T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
-      _mov(T, Src0, RegX8632::Reg_eax);
-      _cbwdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
-    }
-    break;
-  case InstArithmetic::Fadd:
-    _mov(T, Src0);
-    _addss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fsub:
-    _mov(T, Src0);
-    _subss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fmul:
-    _mov(T, Src0);
-    _mulss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Fdiv:
-    _mov(T, Src0);
-    _divss(T, Src1);
-    _mov(Dest, T);
-    break;
-  case InstArithmetic::Frem: {
-    const SizeT MaxSrcs = 2;
-    Type Ty = Dest->getType();
-    InstCall *Call = makeHelperCall(
-        isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
-    Call->addArg(Src0);
-    Call->addArg(Src1);
-    return lowerCall(Call);
-  }
-  }
-}
-
-void TargetX8632::lowerAssign(const InstAssign *Inst) {
-  Variable *Dest = Inst->getDest();
-  Operand *Src0 = Inst->getSrc(0);
-  assert(Dest->getType() == Src0->getType());
-  if (Dest->getType() == IceType_i64) {
-    Src0 = legalize(Src0);
-    Operand *Src0Lo = loOperand(Src0);
-    Operand *Src0Hi = hiOperand(Src0);
-    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Variable *T_Lo = nullptr, *T_Hi = nullptr;
-    _mov(T_Lo, Src0Lo);
-    _mov(DestLo, T_Lo);
-    _mov(T_Hi, Src0Hi);
-    _mov(DestHi, T_Hi);
-  } else {
-    Operand *RI;
-    if (Dest->hasReg()) {
-      // If Dest already has a physical register, then legalize the
-      // Src operand into a Variable with the same register
-      // assignment.  This is mostly a workaround for advanced phi
-      // lowering's ad-hoc register allocation which assumes no
-      // register allocation is needed when at least one of the
-      // operands is non-memory.
-
-      // If we have a physical register for the dest variable, we can
-      // re-enable constant blinding or pooling. Note that this applies
-      // only to advancedPhiLowering(); the flag flip should have no
-      // other side effect.
-      {
-        BoolFlagSaver B(RandomizationPoolingPaused, false);
-        RI = legalize(Src0, Legal_Reg, Dest->getRegNum());
-      }
-    } else {
-      // If Dest could be a stack operand, then RI must be a physical
-      // register or a scalar integer immediate.
-      RI = legalize(Src0, Legal_Reg | Legal_Imm);
-    }
-    if (isVectorType(Dest->getType()))
-      _movp(Dest, RI);
-    else
-      _mov(Dest, RI);
-  }
-}
-
-void TargetX8632::lowerBr(const InstBr *Inst) {
-  if (Inst->isUnconditional()) {
-    _br(Inst->getTargetUnconditional());
-    return;
-  }
-  Operand *Cond = Inst->getCondition();
-
-  // Handle folding opportunities.
-  if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
-    assert(Producer->isDeleted());
-    switch (BoolFolding::getProducerKind(Producer)) {
-    default:
-      break;
-    case BoolFolding::PK_Icmp32: {
-      // TODO(stichnot): Refactor similarities between this block and
-      // the corresponding code in lowerIcmp().
-      auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
-      Operand *Src0 = Producer->getSrc(0);
-      Operand *Src1 = legalize(Producer->getSrc(1));
-      Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
-      _cmp(Src0RM, Src1);
-      _br(getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(),
-          Inst->getTargetFalse());
-      return;
-    }
-    }
-  }
-
-  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
-  Constant *Zero = Ctx->getConstantZero(IceType_i32);
-  _cmp(Src0, Zero);
-  _br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
-}
-
-void TargetX8632::lowerCall(const InstCall *Instr) {
-  // x86-32 calling convention:
-  //
-  // * At the point before the call, the stack must be aligned to 16
-  // bytes.
-  //
-  // * The first four arguments of vector type, regardless of their
-  // position relative to the other arguments in the argument list, are
-  // placed in registers xmm0 - xmm3.
-  //
-  // * Other arguments are pushed onto the stack in right-to-left order,
-  // such that the left-most argument ends up on the top of the stack at
-  // the lowest memory address.
-  //
-  // * Stack arguments of vector type are aligned to start at the next
-  // highest multiple of 16 bytes.  Other stack arguments are aligned to
-  // 4 bytes.
-  //
-  // This is intended to match the section "IA-32 Function Calling
-  // Convention" of the document "OS X ABI Function Call Guide" by
-  // Apple.
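-  // As an illustration (a sketch, not from the ABI document): for a call
-  // f(i32 a, v4f32 b, i32 c), b would be passed in xmm0 while a and c go
-  // to [esp+0] and [esp+4], and the 8-byte parameter area is padded to 16.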
-  NeedsStackAlignment = true;
-
-  typedef std::vector<Operand *> OperandList;
-  OperandList XmmArgs;
-  OperandList StackArgs, StackArgLocations;
-  uint32_t ParameterAreaSizeBytes = 0;
-
-  // Classify each argument operand according to the location where the
-  // argument is passed.
-  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
-    Operand *Arg = Instr->getArg(i);
-    Type Ty = Arg->getType();
-    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
-    assert(typeWidthInBytes(Ty) >= 4);
-    if (isVectorType(Ty) && XmmArgs.size() < X86_MAX_XMM_ARGS) {
-      XmmArgs.push_back(Arg);
-    } else {
-      StackArgs.push_back(Arg);
-      if (isVectorType(Arg->getType())) {
-        ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
-      }
-      Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
-      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
-      StackArgLocations.push_back(OperandX8632Mem::create(Func, Ty, esp, Loc));
-      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
-    }
-  }
-
-  // Adjust the parameter area so that the stack is aligned.  It is
-  // assumed that the stack is already aligned at the start of the
-  // calling sequence.
-  ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
-
-  // Subtract the appropriate amount for the argument area.  This also
-  // takes care of setting the stack adjustment during emission.
-  //
-  // TODO: If for some reason the call instruction gets dead-code
-  // eliminated after lowering, we would need to ensure that the
-  // pre-call and the post-call esp adjustment get eliminated as well.
-  if (ParameterAreaSizeBytes) {
-    _adjust_stack(ParameterAreaSizeBytes);
-  }
-
-  // Copy arguments that are passed on the stack to the appropriate
-  // stack locations.
-  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
-    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
-  }
-
-  // Copy arguments to be passed in registers to the appropriate
-  // registers.
-  // TODO: Investigate the impact of lowering arguments passed in
-  // registers after lowering stack arguments as opposed to the other
-  // way around.  Lowering register arguments after stack arguments may
-  // reduce register pressure.  On the other hand, lowering register
-  // arguments first (before stack arguments) may result in more compact
-  // code, as the memory operand displacements may end up being smaller
-  // before any stack adjustment is done.
-  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
-    Variable *Reg = legalizeToVar(XmmArgs[i], RegX8632::Reg_xmm0 + i);
-    // Generate a FakeUse of register arguments so that they do not get
-    // dead code eliminated as a result of the FakeKill of scratch
-    // registers after the call.
-    Context.insert(InstFakeUse::create(Func, Reg));
-  }
-  // Generate the call instruction.  Assign its result to a temporary
-  // with high register allocation weight.
-  Variable *Dest = Instr->getDest();
-  // ReturnReg doubles as ReturnRegLo as necessary.
-  Variable *ReturnReg = nullptr;
-  Variable *ReturnRegHi = nullptr;
-  if (Dest) {
-    switch (Dest->getType()) {
-    case IceType_NUM:
-      llvm_unreachable("Invalid Call dest type");
-      break;
-    case IceType_void:
-      break;
-    case IceType_i1:
-    case IceType_i8:
-    case IceType_i16:
-    case IceType_i32:
-      ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_eax);
-      break;
-    case IceType_i64:
-      ReturnReg = makeReg(IceType_i32, RegX8632::Reg_eax);
-      ReturnRegHi = makeReg(IceType_i32, RegX8632::Reg_edx);
-      break;
-    case IceType_f32:
-    case IceType_f64:
-      // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
-      // the fstp instruction.
-      break;
-    case IceType_v4i1:
-    case IceType_v8i1:
-    case IceType_v16i1:
-    case IceType_v16i8:
-    case IceType_v8i16:
-    case IceType_v4i32:
-    case IceType_v4f32:
-      ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_xmm0);
-      break;
-    }
-  }
-  Operand *CallTarget = legalize(Instr->getCallTarget());
-  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
-  if (NeedSandboxing) {
-    if (llvm::isa<Constant>(CallTarget)) {
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-    } else {
-      Variable *CallTargetVar = nullptr;
-      _mov(CallTargetVar, CallTarget);
-      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
-      const SizeT BundleSize =
-          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
-      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
-      CallTarget = CallTargetVar;
-    }
-  }
-  Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
-  Context.insert(NewCall);
-  if (NeedSandboxing)
-    _bundle_unlock();
-  if (ReturnRegHi)
-    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
-
-  // Add the appropriate offset to esp.  The call instruction takes care
-  // of resetting the stack offset during emission.
-  if (ParameterAreaSizeBytes) {
-    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
-    _add(esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
-  }
-
-  // Insert a register-kill pseudo instruction.
-  Context.insert(InstFakeKill::create(Func, NewCall));
-
-  // Generate a FakeUse to keep the call live if necessary.
-  if (Instr->hasSideEffects() && ReturnReg) {
-    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
-    Context.insert(FakeUse);
-  }
-
-  if (!Dest)
-    return;
-
-  // Assign the result of the call to Dest.
-  if (ReturnReg) {
-    if (ReturnRegHi) {
-      assert(Dest->getType() == IceType_i64);
-      split64(Dest);
-      Variable *DestLo = Dest->getLo();
-      Variable *DestHi = Dest->getHi();
-      _mov(DestLo, ReturnReg);
-      _mov(DestHi, ReturnRegHi);
-    } else {
-      assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
-             Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
-             isVectorType(Dest->getType()));
-      if (isVectorType(Dest->getType())) {
-        _movp(Dest, ReturnReg);
-      } else {
-        _mov(Dest, ReturnReg);
-      }
-    }
-  } else if (isScalarFloatingType(Dest->getType())) {
-    // Special treatment for an FP function which returns its result in
-    // st(0).
-    // If Dest ends up being a physical xmm register, the fstp emit code
-    // will route st(0) through a temporary stack slot.
-    _fstp(Dest);
-    // Create a fake use of Dest in case it actually isn't used,
-    // because st(0) still needs to be popped.
-    Context.insert(InstFakeUse::create(Func, Dest));
-  }
-}
-
-void TargetX8632::lowerCast(const InstCast *Inst) {
-  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
-  InstCast::OpKind CastKind = Inst->getCastKind();
-  Variable *Dest = Inst->getDest();
-  switch (CastKind) {
-  default:
-    Func->setError("Cast type not supported");
-    return;
-  case InstCast::Sext: {
-    // Src0RM is the source operand legalized to physical register or memory,
-    // but not immediate, since the relevant x86 native instructions don't
-    // allow an immediate operand.  If the operand is an immediate, we could
-    // consider computing the strength-reduced result at translation time,
-    // but we're unlikely to see something like that in the bitcode that
-    // the optimizer wouldn't have already taken care of.
-    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(Dest->getType())) {
-      Type DestTy = Dest->getType();
-      if (DestTy == IceType_v16i8) {
-        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
-        Variable *OneMask = makeVectorOfOnes(Dest->getType());
-        Variable *T = makeReg(DestTy);
-        _movp(T, Src0RM);
-        _pand(T, OneMask);
-        Variable *Zeros = makeVectorOfZeros(Dest->getType());
-        _pcmpgt(T, Zeros);
-        _movp(Dest, T);
-      } else {
-        // width = width(elty) - 1; dest = (src << width) >> width
-        SizeT ShiftAmount =
-            X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
-        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
-        Variable *T = makeReg(DestTy);
-        _movp(T, Src0RM);
-        _psll(T, ShiftConstant);
-        _psra(T, ShiftConstant);
-        _movp(Dest, T);
-      }
-    } else if (Dest->getType() == IceType_i64) {
-      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
-      Constant *Shift = Ctx->getConstantInt32(31);
-      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *T_Lo = makeReg(DestLo->getType());
-      if (Src0RM->getType() == IceType_i32) {
-        _mov(T_Lo, Src0RM);
-      } else if (Src0RM->getType() == IceType_i1) {
-        _movzx(T_Lo, Src0RM);
-        _shl(T_Lo, Shift);
-        _sar(T_Lo, Shift);
-      } else {
-        _movsx(T_Lo, Src0RM);
-      }
-      _mov(DestLo, T_Lo);
-      Variable *T_Hi = nullptr;
-      _mov(T_Hi, T_Lo);
-      if (Src0RM->getType() != IceType_i1)
-        // For i1, the sar instruction is already done above.
-        _sar(T_Hi, Shift);
-      _mov(DestHi, T_Hi);
-    } else if (Src0RM->getType() == IceType_i1) {
-      // t1 = src
-      // shl t1, dst_bitwidth - 1
-      // sar t1, dst_bitwidth - 1
-      // dst = t1
-      size_t DestBits = X86_CHAR_BIT * typeWidthInBytes(Dest->getType());
-      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
-      Variable *T = makeReg(Dest->getType());
-      if (typeWidthInBytes(Dest->getType()) <=
-          typeWidthInBytes(Src0RM->getType())) {
-        _mov(T, Src0RM);
-      } else {
-        // Widen the source using movsx or movzx.  (It doesn't matter
-        // which one, since the following shl/sar overwrite the bits.)
-        _movzx(T, Src0RM);
-      }
-      _shl(T, ShiftAmount);
-      _sar(T, ShiftAmount);
-      _mov(Dest, T);
-    } else {
-      // t1 = movsx src; dst = t1
-      Variable *T = makeReg(Dest->getType());
-      _movsx(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Zext: {
-    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    if (isVectorType(Dest->getType())) {
-      // onemask = materialize(1,1,...); dest = onemask & src
-      Type DestTy = Dest->getType();
-      Variable *OneMask = makeVectorOfOnes(DestTy);
-      Variable *T = makeReg(DestTy);
-      _movp(T, Src0RM);
-      _pand(T, OneMask);
-      _movp(Dest, T);
-    } else if (Dest->getType() == IceType_i64) {
-      // t1=movzx src; dst.lo=t1; dst.hi=0
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *Tmp = makeReg(DestLo->getType());
-      if (Src0RM->getType() == IceType_i32) {
-        _mov(Tmp, Src0RM);
-      } else {
-        _movzx(Tmp, Src0RM);
-      }
-      if (Src0RM->getType() == IceType_i1) {
-        Constant *One = Ctx->getConstantInt32(1);
-        _and(Tmp, One);
-      }
-      _mov(DestLo, Tmp);
-      _mov(DestHi, Zero);
-    } else if (Src0RM->getType() == IceType_i1) {
-      // t = Src0RM; t &= 1; Dest = t
-      Constant *One = Ctx->getConstantInt32(1);
-      Type DestTy = Dest->getType();
-      Variable *T;
-      if (DestTy == IceType_i8) {
-        T = makeReg(DestTy);
-        _mov(T, Src0RM);
-      } else {
-        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
-        T = makeReg(IceType_i32);
-        _movzx(T, Src0RM);
-      }
-      _and(T, One);
-      _mov(Dest, T);
-    } else {
-      // t1 = movzx src; dst = t1
-      Variable *T = makeReg(Dest->getType());
-      _movzx(T, Src0RM);
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Trunc: {
-    if (isVectorType(Dest->getType())) {
-      // onemask = materialize(1,1,...); dst = src & onemask
-      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-      Type Src0Ty = Src0RM->getType();
-      Variable *OneMask = makeVectorOfOnes(Src0Ty);
-      Variable *T = makeReg(Dest->getType());
-      _movp(T, Src0RM);
-      _pand(T, OneMask);
-      _movp(Dest, T);
-    } else {
-      Operand *Src0 = Inst->getSrc(0);
-      if (Src0->getType() == IceType_i64)
-        Src0 = loOperand(Src0);
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      // t1 = trunc Src0RM; Dest = t1
-      Variable *T = nullptr;
-      _mov(T, Src0RM);
-      if (Dest->getType() == IceType_i1)
-        _and(T, Ctx->getConstantInt1(1));
-      _mov(Dest, T);
-    }
-    break;
-  }
-  case InstCast::Fptrunc:
-  case InstCast::Fpext: {
-    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-    // t1 = cvt Src0RM; Dest = t1
-    Variable *T = makeReg(Dest->getType());
-    _cvt(T, Src0RM, InstX8632Cvt::Float2float);
-    _mov(Dest, T);
-    break;
-  }
-  case InstCast::Fptosi:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4i32 &&
-             Inst->getSrc(0)->getType() == IceType_v4f32);
-      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-      if (llvm::isa<OperandX8632Mem>(Src0RM))
-        Src0RM = legalizeToVar(Src0RM);
-      Variable *T = makeReg(Dest->getType());
-      _cvt(T, Src0RM, InstX8632Cvt::Tps2dq);
-      _movp(Dest, T);
-    } else if (Dest->getType() == IceType_i64) {
-      // Use a helper for converting floating-point values to 64-bit
-      // integers.  SSE2 appears to have no way to convert from xmm
-      // registers to something like the edx:eax register pair, and
-      // gcc and clang both want to use x87 instructions complete with
-      // temporary manipulation of the status word.  This helper is
-      // not needed for x86-64.
-      split64(Dest);
-      const SizeT MaxSrcs = 1;
-      Type SrcType = Inst->getSrc(0)->getType();
-      InstCall *Call =
-          makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64
-                                                           : H_fptosi_f64_i64,
-                         Dest, MaxSrcs);
-      Call->addArg(Inst->getSrc(0));
-      lowerCall(Call);
-    } else {
-      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = makeReg(IceType_i32);
-      Variable *T_2 = makeReg(Dest->getType());
-      _cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (Dest->getType() == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Fptoui:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4i32 &&
-             Inst->getSrc(0)->getType() == IceType_v4f32);
-      const SizeT MaxSrcs = 1;
-      InstCall *Call = makeHelperCall(H_fptoui_4xi32_f32, Dest, MaxSrcs);
-      Call->addArg(Inst->getSrc(0));
-      lowerCall(Call);
-    } else if (Dest->getType() == IceType_i64 ||
-               Dest->getType() == IceType_i32) {
-      // Use a helper for both x86-32 and x86-64.
-      split64(Dest);
-      const SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
-      Type SrcType = Inst->getSrc(0)->getType();
-      IceString TargetString;
-      if (isInt32Asserting32Or64(DestType)) {
-        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
-                                                         : H_fptoui_f64_i32;
-      } else {
-        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
-                                                         : H_fptoui_f64_i64;
-      }
-      InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
-      Call->addArg(Inst->getSrc(0));
-      lowerCall(Call);
-      return;
-    } else {
-      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
-      Variable *T_1 = makeReg(IceType_i32);
-      Variable *T_2 = makeReg(Dest->getType());
-      _cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
-      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
-      if (Dest->getType() == IceType_i1)
-        _and(T_2, Ctx->getConstantInt1(1));
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Sitofp:
-    if (isVectorType(Dest->getType())) {
-      assert(Dest->getType() == IceType_v4f32 &&
-             Inst->getSrc(0)->getType() == IceType_v4i32);
-      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-      if (llvm::isa<OperandX8632Mem>(Src0RM))
-        Src0RM = legalizeToVar(Src0RM);
-      Variable *T = makeReg(Dest->getType());
-      _cvt(T, Src0RM, InstX8632Cvt::Dq2ps);
-      _movp(Dest, T);
-    } else if (Inst->getSrc(0)->getType() == IceType_i64) {
-      // Use a helper for x86-32.
-      const SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
-      InstCall *Call =
-          makeHelperCall(isFloat32Asserting32Or64(DestType) ? H_sitofp_i64_f32
-                                                            : H_sitofp_i64_f64,
-                         Dest, MaxSrcs);
-      // TODO: Call the correct compiler-rt helper function.
-      Call->addArg(Inst->getSrc(0));
-      lowerCall(Call);
-      return;
-    } else {
-      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
-      // Sign-extend the operand.
-      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = makeReg(IceType_i32);
-      Variable *T_2 = makeReg(Dest->getType());
-      if (Src0RM->getType() == IceType_i32)
-        _mov(T_1, Src0RM);
-      else
-        _movsx(T_1, Src0RM);
-      _cvt(T_2, T_1, InstX8632Cvt::Si2ss);
-      _mov(Dest, T_2);
-    }
-    break;
-  case InstCast::Uitofp: {
-    Operand *Src0 = Inst->getSrc(0);
-    if (isVectorType(Src0->getType())) {
-      assert(Dest->getType() == IceType_v4f32 &&
-             Src0->getType() == IceType_v4i32);
-      const SizeT MaxSrcs = 1;
-      InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      lowerCall(Call);
-    } else if (Src0->getType() == IceType_i64 ||
-               Src0->getType() == IceType_i32) {
-      // Use a helper for i64 on both x86-32 and x86-64, and for
-      // i32 on x86-32.
-      const SizeT MaxSrcs = 1;
-      Type DestType = Dest->getType();
-      IceString TargetString;
-      if (isInt32Asserting32Or64(Src0->getType())) {
-        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32
-                                                          : H_uitofp_i32_f64;
-      } else {
-        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32
-                                                          : H_uitofp_i64_f64;
-      }
-      InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      lowerCall(Call);
-      return;
-    } else {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      // Zero-extend the operand.
-      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
-      Variable *T_1 = makeReg(IceType_i32);
-      Variable *T_2 = makeReg(Dest->getType());
-      if (Src0RM->getType() == IceType_i32)
-        _mov(T_1, Src0RM);
-      else
-        _movzx(T_1, Src0RM);
-      _cvt(T_2, T_1, InstX8632Cvt::Si2ss);
-      _mov(Dest, T_2);
-    }
-    break;
-  }
-  case InstCast::Bitcast: {
-    Operand *Src0 = Inst->getSrc(0);
-    if (Dest->getType() == Src0->getType()) {
-      InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
-      lowerAssign(Assign);
-      return;
-    }
-    switch (Dest->getType()) {
-    default:
-      llvm_unreachable("Unexpected Bitcast dest type");
-    case IceType_i8: {
-      assert(Src0->getType() == IceType_v8i1);
-      InstCall *Call = makeHelperCall(H_bitcast_8xi1_i8, Dest, 1);
-      Call->addArg(Src0);
-      lowerCall(Call);
-    } break;
-    case IceType_i16: {
-      assert(Src0->getType() == IceType_v16i1);
-      InstCall *Call = makeHelperCall(H_bitcast_16xi1_i16, Dest, 1);
-      Call->addArg(Src0);
-      lowerCall(Call);
-    } break;
-    case IceType_i32:
-    case IceType_f32: {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      Type DestType = Dest->getType();
-      Type SrcType = Src0RM->getType();
-      (void)DestType;
-      assert((DestType == IceType_i32 && SrcType == IceType_f32) ||
-             (DestType == IceType_f32 && SrcType == IceType_i32));
-      // a.i32 = bitcast b.f32 ==>
-      //   t.f32 = b.f32
-      //   s.f32 = spill t.f32
-      //   a.i32 = s.f32
-      Variable *T = nullptr;
-      // TODO: Should be able to force a spill setup by calling legalize() with
-      // Legal_Mem and not Legal_Reg or Legal_Imm.
-      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(SrcType);
-      SpillVar->setLinkedTo(Dest);
-      Variable *Spill = SpillVar;
-      Spill->setWeight(RegWeight::Zero);
-      _mov(T, Src0RM);
-      _mov(Spill, T);
-      _mov(Dest, Spill);
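-      // Illustratively (hypothetical registers), the i32 = bitcast f32
-      // direction typically materializes as:
-      //   movss xmm0, <src>
-      //   movss <stack_slot>, xmm0
-      //   mov   eax, <stack_slot>
-      // i.e., the bits take a round trip through the spill slot.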
-    } break;
-    case IceType_i64: {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      assert(Src0RM->getType() == IceType_f64);
-      // a.i64 = bitcast b.f64 ==>
-      //   s.f64 = spill b.f64
-      //   t_lo.i32 = lo(s.f64)
-      //   a_lo.i32 = t_lo.i32
-      //   t_hi.i32 = hi(s.f64)
-      //   a_hi.i32 = t_hi.i32
-      Operand *SpillLo, *SpillHi;
-      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
-        SpillVariable *SpillVar =
-            Func->makeVariable<SpillVariable>(IceType_f64);
-        SpillVar->setLinkedTo(Src0Var);
-        Variable *Spill = SpillVar;
-        Spill->setWeight(RegWeight::Zero);
-        _movq(Spill, Src0RM);
-        SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
-        SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
-      } else {
-        SpillLo = loOperand(Src0RM);
-        SpillHi = hiOperand(Src0RM);
-      }
-
-      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Variable *T_Lo = makeReg(IceType_i32);
-      Variable *T_Hi = makeReg(IceType_i32);
-
-      _mov(T_Lo, SpillLo);
-      _mov(DestLo, T_Lo);
-      _mov(T_Hi, SpillHi);
-      _mov(DestHi, T_Hi);
-    } break;
-    case IceType_f64: {
-      Src0 = legalize(Src0);
-      assert(Src0->getType() == IceType_i64);
-      if (llvm::isa<OperandX8632Mem>(Src0)) {
-        Variable *T = Func->makeVariable(Dest->getType());
-        _movq(T, Src0);
-        _movq(Dest, T);
-        break;
-      }
-      // a.f64 = bitcast b.i64 ==>
-      //   t_lo.i32 = b_lo.i32
-      //   FakeDef(s.f64)
-      //   lo(s.f64) = t_lo.i32
-      //   t_hi.i32 = b_hi.i32
-      //   hi(s.f64) = t_hi.i32
-      //   a.f64 = s.f64
-      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
-      SpillVar->setLinkedTo(Dest);
-      Variable *Spill = SpillVar;
-      Spill->setWeight(RegWeight::Zero);
-
-      Variable *T_Lo = nullptr, *T_Hi = nullptr;
-      VariableSplit *SpillLo =
-          VariableSplit::create(Func, Spill, VariableSplit::Low);
-      VariableSplit *SpillHi =
-          VariableSplit::create(Func, Spill, VariableSplit::High);
-      _mov(T_Lo, loOperand(Src0));
-      // Technically, the Spill is defined after the _store happens, but
-      // SpillLo is considered a "use" of Spill so define Spill before it
-      // is used.
-      Context.insert(InstFakeDef::create(Func, Spill));
-      _store(T_Lo, SpillLo);
-      _mov(T_Hi, hiOperand(Src0));
-      _store(T_Hi, SpillHi);
-      _movq(Dest, Spill);
-    } break;
-    case IceType_v8i1: {
-      assert(Src0->getType() == IceType_i8);
-      InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1);
-      Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
-      // Arguments to functions are required to be at least 32 bits wide.
-      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
-      Call->addArg(Src0AsI32);
-      lowerCall(Call);
-    } break;
-    case IceType_v16i1: {
-      assert(Src0->getType() == IceType_i16);
-      InstCall *Call = makeHelperCall(H_bitcast_i16_16xi1, Dest, 1);
-      Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
-      // Arguments to functions are required to be at least 32 bits wide.
-      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
-      Call->addArg(Src0AsI32);
-      lowerCall(Call);
-    } break;
-    case IceType_v8i16:
-    case IceType_v16i8:
-    case IceType_v4i32:
-    case IceType_v4f32: {
-      _movp(Dest, legalizeToVar(Src0));
-    } break;
-    }
-    break;
-  }
-  }
-}
-
-void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
-  Operand *SourceVectNotLegalized = Inst->getSrc(0);
-  ConstantInteger32 *ElementIndex =
-      llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(1));
-  // Only constant indices are allowed in PNaCl IR.
-  assert(ElementIndex);
-
-  unsigned Index = ElementIndex->getValue();
-  Type Ty = SourceVectNotLegalized->getType();
-  Type ElementTy = typeElementType(Ty);
-  Type InVectorElementTy = getInVectorElementType(Ty);
-  Variable *ExtractedElementR = makeReg(InVectorElementTy);
-
-  // TODO(wala): Determine the best lowering sequences for each type.
-  bool CanUsePextr =
-      Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;
-  if (CanUsePextr && Ty != IceType_v4f32) {
-    // Use pextrb, pextrw, or pextrd.
-    Constant *Mask = Ctx->getConstantInt32(Index);
-    Variable *SourceVectR = legalizeToVar(SourceVectNotLegalized);
-    _pextr(ExtractedElementR, SourceVectR, Mask);
-  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Use pshufd and movd/movss.
-    Variable *T = nullptr;
-    if (Index) {
-      // The shuffle only needs to occur if the element to be extracted
-      // is not at the lowest index.
-      Constant *Mask = Ctx->getConstantInt32(Index);
-      T = makeReg(Ty);
-      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
-    } else {
-      T = legalizeToVar(SourceVectNotLegalized);
-    }
-
-    if (InVectorElementTy == IceType_i32) {
-      _movd(ExtractedElementR, T);
-    } else { // InVectorElementTy == IceType_f32
-      // TODO(wala): _movss is only used here because _mov does not
-      // allow a vector source and a scalar destination.  It should be
-      // possible to use _mov here.
-      // _movss is a binary instruction, so the FakeDef is needed to
-      // keep the live range analysis consistent.
-      Context.insert(InstFakeDef::create(Func, ExtractedElementR));
-      _movss(ExtractedElementR, T);
-    }
-  } else {
-    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
-    // Spill the value to a stack slot and do the extraction in memory.
-    //
-    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
-    // support for legalizing to mem is implemented.
-    Variable *Slot = Func->makeVariable(Ty);
-    Slot->setWeight(RegWeight::Zero);
-    _movp(Slot, legalizeToVar(SourceVectNotLegalized));
-
-    // Compute the location of the element in memory.
-    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
-    OperandX8632Mem *Loc =
-        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
-    _mov(ExtractedElementR, Loc);
-  }
-
-  if (ElementTy == IceType_i1) {
-    // Truncate extracted integers to i1s if necessary.
-    Variable *T = makeReg(IceType_i1);
-    InstCast *Cast =
-        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
-    lowerCast(Cast);
-    ExtractedElementR = T;
-  }
-
-  // Copy the element to the destination.
-  Variable *Dest = Inst->getDest();
-  _mov(Dest, ExtractedElementR);
-}
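-
-// An illustrative sketch (hypothetical registers) of the pshufd+movss path
-// above: extracting element 2 of a v4f32 without SSE4.1 becomes roughly
-//   pshufd xmm1, <src>, 2   ; move element 2 into lane 0
-//   movss  xmm2, xmm1       ; copy the low f32 out
-// while with SSE4.1 (or for v8i16/v8i1) a single pextr* with the index as
-// its immediate suffices.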
-
-void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
-  Operand *Src0 = Inst->getSrc(0);
-  Operand *Src1 = Inst->getSrc(1);
-  Variable *Dest = Inst->getDest();
-
-  if (isVectorType(Dest->getType())) {
-    InstFcmp::FCond Condition = Inst->getCondition();
-    size_t Index = static_cast<size_t>(Condition);
-    assert(Index < TableFcmpSize);
-
-    if (TableFcmp[Index].SwapVectorOperands) {
-      Operand *T = Src0;
-      Src0 = Src1;
-      Src1 = T;
-    }
-
-    Variable *T = nullptr;
-
-    if (Condition == InstFcmp::True) {
-      // makeVectorOfMinusOnes() requires an integer vector type.
-      T = makeVectorOfMinusOnes(IceType_v4i32);
-    } else if (Condition == InstFcmp::False) {
-      T = makeVectorOfZeros(Dest->getType());
-    } else {
-      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-      Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-      if (llvm::isa<OperandX8632Mem>(Src1RM))
-        Src1RM = legalizeToVar(Src1RM);
-
-      switch (Condition) {
-      default: {
-        CondX86::CmppsCond Predicate = TableFcmp[Index].Predicate;
-        assert(Predicate != CondX86::Cmpps_Invalid);
-        T = makeReg(Src0RM->getType());
-        _movp(T, Src0RM);
-        _cmpps(T, Src1RM, Predicate);
-      } break;
-      case InstFcmp::One: {
-        // Check both unequal and ordered.
-        T = makeReg(Src0RM->getType());
-        Variable *T2 = makeReg(Src0RM->getType());
-        _movp(T, Src0RM);
-        _cmpps(T, Src1RM, CondX86::Cmpps_neq);
-        _movp(T2, Src0RM);
-        _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
-        _pand(T, T2);
-      } break;
-      case InstFcmp::Ueq: {
-        // Check both equal or unordered.
-        T = makeReg(Src0RM->getType());
-        Variable *T2 = makeReg(Src0RM->getType());
-        _movp(T, Src0RM);
-        _cmpps(T, Src1RM, CondX86::Cmpps_eq);
-        _movp(T2, Src0RM);
-        _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
-        _por(T, T2);
-      } break;
-      }
-    }
-
-    _movp(Dest, T);
-    eliminateNextVectorSextInstruction(Dest);
-    return;
-  }
-
-  // Lowering a = fcmp cond, b, c
-  //   ucomiss b, c       /* only if C1 != Br_None */
-  //                      /* but swap b,c order if SwapOperands==true */
-  //   mov a, <default>
-  //   j<C1> label        /* only if C1 != Br_None */
-  //   j<C2> label        /* only if C2 != Br_None */
-  //   FakeUse(a)         /* only if C1 != Br_None */
-  //   mov a, !<default>  /* only if C1 != Br_None */
-  //   label:             /* only if C1 != Br_None */
-  //
-  // setcc lowering when C1 != Br_None && C2 == Br_None:
-  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
-  //   setcc a, C1
-  InstFcmp::FCond Condition = Inst->getCondition();
-  size_t Index = static_cast<size_t>(Condition);
-  assert(Index < TableFcmpSize);
-  if (TableFcmp[Index].SwapScalarOperands)
-    std::swap(Src0, Src1);
-  bool HasC1 = (TableFcmp[Index].C1 != CondX86::Br_None);
-  bool HasC2 = (TableFcmp[Index].C2 != CondX86::Br_None);
-  if (HasC1) {
-    Src0 = legalize(Src0);
-    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-    Variable *T = nullptr;
-    _mov(T, Src0);
-    _ucomiss(T, Src1RM);
-    if (!HasC2) {
-      assert(TableFcmp[Index].Default);
-      _setcc(Dest, TableFcmp[Index].C1);
-      return;
-    }
-  }
-  Constant *Default = Ctx->getConstantInt32(TableFcmp[Index].Default);
-  _mov(Dest, Default);
-  if (HasC1) {
-    InstX8632Label *Label = InstX8632Label::create(Func, this);
-    _br(TableFcmp[Index].C1, Label);
-    if (HasC2) {
-      _br(TableFcmp[Index].C2, Label);
-    }
-    Constant *NonDefault = Ctx->getConstantInt32(!TableFcmp[Index].Default);
-    _mov_nonkillable(Dest, NonDefault);
-    Context.insert(Label);
-  }
-}
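-
-// A worked instance of the scalar path above, assuming TableFcmp maps 'ogt'
-// to the single setcc-able condition 'a' (above): a = fcmp ogt b, c becomes
-//   ucomiss b, c
-//   seta    a
-// since ucomiss leaves CF=0 and ZF=0 exactly when b > c and both operands
-// are ordered.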
-
-void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
-  Operand *Src0 = legalize(Inst->getSrc(0));
-  Operand *Src1 = legalize(Inst->getSrc(1));
-  Variable *Dest = Inst->getDest();
-
-  if (isVectorType(Dest->getType())) {
-    Type Ty = Src0->getType();
-    // Promote i1 vectors to 128-bit integer vector types.
-    if (typeElementType(Ty) == IceType_i1) {
-      Type NewTy = IceType_NUM;
-      switch (Ty) {
-      default:
-        llvm_unreachable("unexpected type");
-        break;
-      case IceType_v4i1:
-        NewTy = IceType_v4i32;
-        break;
-      case IceType_v8i1:
-        NewTy = IceType_v8i16;
-        break;
-      case IceType_v16i1:
-        NewTy = IceType_v16i8;
-        break;
-      }
-      Variable *NewSrc0 = Func->makeVariable(NewTy);
-      Variable *NewSrc1 = Func->makeVariable(NewTy);
-      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
-      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
-      Src0 = NewSrc0;
-      Src1 = NewSrc1;
-      Ty = NewTy;
-    }
-
-    InstIcmp::ICond Condition = Inst->getCondition();
-
-    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
-    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
-
-    // SSE2 only has signed comparison operations.  Transform unsigned
-    // inputs in a manner that allows for the use of signed comparison
-    // operations by flipping the high order bits.
-    if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
-        Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
-      Variable *T0 = makeReg(Ty);
-      Variable *T1 = makeReg(Ty);
-      Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
-      _movp(T0, Src0RM);
-      _pxor(T0, HighOrderBits);
-      _movp(T1, Src1RM);
-      _pxor(T1, HighOrderBits);
-      Src0RM = T0;
-      Src1RM = T1;
-    }
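-    // Illustrative lane values: for a v4i32 ugt compare, 0xFFFFFFFF vs
-    // 0x00000001 becomes 0x7FFFFFFF vs 0x80000001 after the xor, and the
-    // signed pcmpgt then gives the correct unsigned answer, since flipping
-    // the sign bit maps unsigned order onto signed order.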
-
-    Variable *T = makeReg(Ty);
-    switch (Condition) {
-    default:
-      llvm_unreachable("unexpected condition");
-      break;
-    case InstIcmp::Eq: {
-      if (llvm::isa<OperandX8632Mem>(Src1RM))
-        Src1RM = legalizeToVar(Src1RM);
-      _movp(T, Src0RM);
-      _pcmpeq(T, Src1RM);
-    } break;
-    case InstIcmp::Ne: {
-      if (llvm::isa<OperandX8632Mem>(Src1RM))
-        Src1RM = legalizeToVar(Src1RM);
-      _movp(T, Src0RM);
-      _pcmpeq(T, Src1RM);
-      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-      _pxor(T, MinusOne);
-    } break;
-    case InstIcmp::Ugt:
-    case InstIcmp::Sgt: {
-      if (llvm::isa<OperandX8632Mem>(Src1RM))
-        Src1RM = legalizeToVar(Src1RM);
-      _movp(T, Src0RM);
-      _pcmpgt(T, Src1RM);
-    } break;
-    case InstIcmp::Uge:
-    case InstIcmp::Sge: {
-      // !(Src1RM > Src0RM)
-      if (llvm::isa<OperandX8632Mem>(Src0RM))
-        Src0RM = legalizeToVar(Src0RM);
-      _movp(T, Src1RM);
-      _pcmpgt(T, Src0RM);
-      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-      _pxor(T, MinusOne);
-    } break;
-    case InstIcmp::Ult:
-    case InstIcmp::Slt: {
-      if (llvm::isa<OperandX8632Mem>(Src0RM))
-        Src0RM = legalizeToVar(Src0RM);
-      _movp(T, Src1RM);
-      _pcmpgt(T, Src0RM);
-    } break;
-    case InstIcmp::Ule:
-    case InstIcmp::Sle: {
-      // !(Src0RM > Src1RM)
-      if (llvm::isa<OperandX8632Mem>(Src1RM))
-        Src1RM = legalizeToVar(Src1RM);
-      _movp(T, Src0RM);
-      _pcmpgt(T, Src1RM);
-      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-      _pxor(T, MinusOne);
-    } break;
-    }
-
-    _movp(Dest, T);
-    eliminateNextVectorSextInstruction(Dest);
-    return;
-  }
-
-  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
-  if (Src0->getType() == IceType_i64) {
-    InstIcmp::ICond Condition = Inst->getCondition();
-    size_t Index = static_cast<size_t>(Condition);
-    assert(Index < TableIcmp64Size);
-    Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
-    Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
-    Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
-    Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
-    Constant *Zero = Ctx->getConstantZero(IceType_i32);
-    Constant *One = Ctx->getConstantInt32(1);
-    InstX8632Label *LabelFalse = InstX8632Label::create(Func, this);
-    InstX8632Label *LabelTrue = InstX8632Label::create(Func, this);
-    _mov(Dest, One);
-    _cmp(Src0HiRM, Src1HiRI);
-    if (TableIcmp64[Index].C1 != CondX86::Br_None)
-      _br(TableIcmp64[Index].C1, LabelTrue);
-    if (TableIcmp64[Index].C2 != CondX86::Br_None)
-      _br(TableIcmp64[Index].C2, LabelFalse);
-    _cmp(Src0LoRM, Src1LoRI);
-    _br(TableIcmp64[Index].C3, LabelTrue);
-    Context.insert(LabelFalse);
-    _mov_nonkillable(Dest, Zero);
-    Context.insert(LabelTrue);
-    return;
-  }
-
-  // cmp b, c
-  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
-  _cmp(Src0RM, Src1);
-  _setcc(Dest, getIcmp32Mapping(Inst->getCondition()));
-}
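-
-// For illustration, 'a = icmp ult i64 b, c' takes the TableIcmp64 path
-// above; assuming its entries for ult are (C1=b, C2=a, C3=b), the expansion
-// is roughly:
-//   mov  a, 1
-//   cmp  b_hi, c_hi
-//   jb   LabelTrue
-//   ja   LabelFalse
-//   cmp  b_lo, c_lo
-//   jb   LabelTrue
-// LabelFalse:
-//   mov  a, 0
-// LabelTrue: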
-
-void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
-  Operand *SourceVectNotLegalized = Inst->getSrc(0);
-  Operand *ElementToInsertNotLegalized = Inst->getSrc(1);
-  ConstantInteger32 *ElementIndex =
-      llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(2));
-  // Only constant indices are allowed in PNaCl IR.
-  assert(ElementIndex);
-  unsigned Index = ElementIndex->getValue();
-  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
-
-  Type Ty = SourceVectNotLegalized->getType();
-  Type ElementTy = typeElementType(Ty);
-  Type InVectorElementTy = getInVectorElementType(Ty);
-
-  if (ElementTy == IceType_i1) {
-    // Expand the element to the appropriate size for it to be inserted
-    // in the vector.
-    Variable *Expanded = Func->makeVariable(InVectorElementTy);
-    InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
-                                      ElementToInsertNotLegalized);
-    lowerCast(Cast);
-    ElementToInsertNotLegalized = Expanded;
-  }
-
-  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
-    // Use insertps, pinsrb, pinsrw, or pinsrd.
-    Operand *ElementRM =
-        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
-    Operand *SourceVectRM =
-        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
-    Variable *T = makeReg(Ty);
-    _movp(T, SourceVectRM);
-    if (Ty == IceType_v4f32)
-      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
-    else
-      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
-    _movp(Inst->getDest(), T);
-  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Use shufps or movss.
-    Variable *ElementR = nullptr;
-    Operand *SourceVectRM =
-        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
-
-    if (InVectorElementTy == IceType_f32) {
-      // ElementR will be in an XMM register since it is floating point.
-      ElementR = legalizeToVar(ElementToInsertNotLegalized);
-    } else {
-      // Copy an integer to an XMM register.
-      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
-      ElementR = makeReg(Ty);
-      _movd(ElementR, T);
-    }
-
-    if (Index == 0) {
-      Variable *T = makeReg(Ty);
-      _movp(T, SourceVectRM);
-      _movss(T, ElementR);
-      _movp(Inst->getDest(), T);
-      return;
-    }
-
-    // shufps treats the source and destination operands as vectors of
-    // four doublewords.  The destination's two high doublewords are
-    // selected from the source operand and the two low doublewords are
-    // selected from the (original value of the) destination operand.
-    // An insertelement operation can be effected with a sequence of two
-    // shufps operations with appropriate masks.  In all cases below,
-    // Element[0] is being inserted into SourceVectOperand.  Indices are
-    // ordered from left to right.
-    //
-    // insertelement into index 1 (result is stored in ElementR):
-    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
-    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
-    //
-    // insertelement into index 2 (result is stored in T):
-    //   T := SourceVectRM
-    //   ElementR := ElementR[0, 0] T[0, 3]
-    //   T := T[0, 1] ElementR[0, 3]
-    //
-    // insertelement into index 3 (result is stored in T):
-    //   T := SourceVectRM
-    //   ElementR := ElementR[0, 0] T[0, 2]
-    //   T := T[0, 1] ElementR[3, 0]
-    const unsigned char Mask1[3] = {0, 192, 128};
-    const unsigned char Mask2[3] = {227, 196, 52};
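-    // Decoding the immediates: each shufps mask byte packs four 2-bit lane
-    // selectors, low bits first, where the low two selectors index the
-    // first operand and the high two index the second.  E.g., Mask1[1] ==
-    // 192 == 0b11'00'00'00 encodes selectors (0, 0, 0, 3), i.e. the
-    // "ElementR[0, 0] T[0, 3]" line above.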
-
-    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
-    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
-
-    if (Index == 1) {
-      _shufps(ElementR, SourceVectRM, Mask1Constant);
-      _shufps(ElementR, SourceVectRM, Mask2Constant);
-      _movp(Inst->getDest(), ElementR);
-    } else {
-      Variable *T = makeReg(Ty);
-      _movp(T, SourceVectRM);
-      _shufps(ElementR, T, Mask1Constant);
-      _shufps(T, ElementR, Mask2Constant);
-      _movp(Inst->getDest(), T);
-    }
-  } else {
-    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
-    // Spill the value to a stack slot and perform the insertion in
-    // memory.
-    //
-    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
-    // support for legalizing to mem is implemented.
-    Variable *Slot = Func->makeVariable(Ty);
-    Slot->setWeight(RegWeight::Zero);
-    _movp(Slot, legalizeToVar(SourceVectNotLegalized));
-
-    // Compute the location of the position to insert in memory.
-    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
-    OperandX8632Mem *Loc =
-        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
-    _store(legalizeToVar(ElementToInsertNotLegalized), Loc);
-
-    Variable *T = makeReg(Ty);
-    _movp(T, Slot);
-    _movp(Inst->getDest(), T);
-  }
-}
-
-void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
-  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
-  case Intrinsics::AtomicCmpxchg: {
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(3)),
-            getConstantMemoryOrder(Instr->getArg(4)))) {
-      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
-      return;
-    }
-    Variable *DestPrev = Instr->getDest();
-    Operand *PtrToMem = Instr->getArg(0);
-    Operand *Expected = Instr->getArg(1);
-    Operand *Desired = Instr->getArg(2);
-    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
-      return;
-    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
-    return;
-  }
-  case Intrinsics::AtomicFence:
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
-      Func->setError("Unexpected memory ordering for AtomicFence");
-      return;
-    }
-    _mfence();
-    return;
-  case Intrinsics::AtomicFenceAll:
-    // NOTE: FenceAll should prevent any load/store from being moved
-    // across the fence (both atomic and non-atomic). The InstX8632Mfence
-    // instruction is currently marked coarsely as "HasSideEffects".
-    _mfence();
-    return;
-  case Intrinsics::AtomicIsLockFree: {
-    // X86 is always lock free for naturally aligned 8/16/32/64 bit
-    // accesses.
-    // TODO(jvoung): Since the result is constant when given a constant
-    // byte size, this opens up DCE opportunities.
-    Operand *ByteSize = Instr->getArg(0);
-    Variable *Dest = Instr->getDest();
-    if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
-      Constant *Result;
-      switch (CI->getValue()) {
-      default:
-        // Some x86-64 processors support the cmpxchg16b instruction, which
-        // can make 16-byte operations lock free (when used with the LOCK
-        // prefix). However, that's not supported in 32-bit mode, so just
-        // return 0 even for large sizes.
-        Result = Ctx->getConstantZero(IceType_i32);
-        break;
-      case 1:
-      case 2:
-      case 4:
-      case 8:
-        Result = Ctx->getConstantInt32(1);
-        break;
-      }
-      _mov(Dest, Result);
-      return;
-    }
-    // The PNaCl ABI requires the byte size to be a compile-time constant.
-    Func->setError("AtomicIsLockFree byte size should be compile-time const");
-    return;
-  }
-  case Intrinsics::AtomicLoad: {
-    // We require the memory address to be naturally aligned.
-    // Given that, normal loads are atomic.
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
-      Func->setError("Unexpected memory ordering for AtomicLoad");
-      return;
-    }
-    Variable *Dest = Instr->getDest();
-    if (Dest->getType() == IceType_i64) {
-      // Follow what GCC does and use a movq instead of what lowerLoad()
-      // normally does (split the load into two).
-      // Thus, this skips load/arithmetic op folding. Load/arithmetic folding
-      // can't happen anyway, since this is x86-32 and integer arithmetic only
-      // happens on 32-bit quantities.
-      Variable *T = makeReg(IceType_f64);
-      OperandX8632Mem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
-      _movq(T, Addr);
-      // Then cast the bits back out of the XMM register to the i64 Dest.
-      InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
-      lowerCast(Cast);
-      // Make sure that the atomic load isn't elided when unused.
-      Context.insert(InstFakeUse::create(Func, Dest->getLo()));
-      Context.insert(InstFakeUse::create(Func, Dest->getHi()));
-      return;
-    }
-    InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
-    lowerLoad(Load);
-    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
-    // Since lowerLoad may fuse the load w/ an arithmetic instruction,
-    // insert the FakeUse on the last-inserted instruction's dest.
-    Context.insert(
-        InstFakeUse::create(Func, Context.getLastInserted()->getDest()));
-    return;
-  }
-  case Intrinsics::AtomicRMW:
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
-      Func->setError("Unexpected memory ordering for AtomicRMW");
-      return;
-    }
-    lowerAtomicRMW(
-        Instr->getDest(),
-        static_cast<uint32_t>(
-            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
-        Instr->getArg(1), Instr->getArg(2));
-    return;
-  case Intrinsics::AtomicStore: {
-    if (!Intrinsics::isMemoryOrderValid(
-            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
-      Func->setError("Unexpected memory ordering for AtomicStore");
-      return;
-    }
-    // We require the memory address to be naturally aligned.
-    // Given that, normal stores are atomic.
-    // Add a fence after the store to make it visible.
-    Operand *Value = Instr->getArg(0);
-    Operand *Ptr = Instr->getArg(1);
-    if (Value->getType() == IceType_i64) {
-      // Use a movq instead of what lowerStore() normally does
-      // (split the store into two), following what GCC does.
-      // Cast the bits from int -> to an xmm register first.
-      Variable *T = makeReg(IceType_f64);
-      InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
-      lowerCast(Cast);
-      // Then store XMM w/ a movq.
-      OperandX8632Mem *Addr = formMemoryOperand(Ptr, IceType_f64);
-      _storeq(T, Addr);
-      _mfence();
-      return;
-    }
-    InstStore *Store = InstStore::create(Func, Value, Ptr);
-    lowerStore(Store);
-    _mfence();
-    return;
-  }
-  case Intrinsics::Bswap: {
-    Variable *Dest = Instr->getDest();
-    Operand *Val = Instr->getArg(0);
-    // In 32-bit mode, bswap only works on 32-bit arguments, and the
-    // argument must be a register. Use a rotate left by 8 for 16-bit bswap
-    // (e.g., 0x1234 rotated left by 8 gives 0x3412).
-    if (Val->getType() == IceType_i64) {
-      Variable *T_Lo = legalizeToVar(loOperand(Val));
-      Variable *T_Hi = legalizeToVar(hiOperand(Val));
-      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      _bswap(T_Lo);
-      _bswap(T_Hi);
-      _mov(DestLo, T_Hi);
-      _mov(DestHi, T_Lo);
-    } else if (Val->getType() == IceType_i32) {
-      Variable *T = legalizeToVar(Val);
-      _bswap(T);
-      _mov(Dest, T);
-    } else {
-      assert(Val->getType() == IceType_i16);
-      Val = legalize(Val);
-      Constant *Eight = Ctx->getConstantInt16(8);
-      Variable *T = nullptr;
-      _mov(T, Val);
-      _rol(T, Eight);
-      _mov(Dest, T);
-    }
-    return;
-  }
-  case Intrinsics::Ctpop: {
-    Variable *Dest = Instr->getDest();
-    Operand *Val = Instr->getArg(0);
-    InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
-                                        ? H_call_ctpop_i32
-                                        : H_call_ctpop_i64,
-                                    Dest, 1);
-    Call->addArg(Val);
-    lowerCall(Call);
-    // The popcount helpers always return 32-bit values, while the intrinsic's
-    // signature matches the native POPCNT instruction and fills a 64-bit reg
-    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
-    // the user doesn't do that in the IR. If the user does that in the IR,
-    // then this zeroing instruction is dead and gets optimized out.
-    if (Val->getType() == IceType_i64) {
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(DestHi, Zero);
-    }
-    return;
-  }
-  case Intrinsics::Ctlz: {
-    // The "is zero undef" parameter is ignored and we always return
-    // a well-defined value.
-    Operand *Val = legalize(Instr->getArg(0));
-    Operand *FirstVal;
-    Operand *SecondVal = nullptr;
-    if (Val->getType() == IceType_i64) {
-      FirstVal = loOperand(Val);
-      SecondVal = hiOperand(Val);
-    } else {
-      FirstVal = Val;
-    }
-    const bool IsCttz = false;
-    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
-                    SecondVal);
-    return;
-  }
-  case Intrinsics::Cttz: {
-    // The "is zero undef" parameter is ignored and we always return
-    // a well-defined value.
-    Operand *Val = legalize(Instr->getArg(0));
-    Operand *FirstVal;
-    Operand *SecondVal = nullptr;
-    if (Val->getType() == IceType_i64) {
-      FirstVal = hiOperand(Val);
-      SecondVal = loOperand(Val);
-    } else {
-      FirstVal = Val;
-    }
-    const bool IsCttz = true;
-    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
-                    SecondVal);
-    return;
-  }
-  case Intrinsics::Fabs: {
-    Operand *Src = legalize(Instr->getArg(0));
-    Type Ty = Src->getType();
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeVectorOfFabsMask(Ty);
-    // The pand instruction operates on an m128 memory operand, so if
-    // Src is an f32 or f64, we need to make sure it's in a register.
-    if (isVectorType(Ty)) {
-      if (llvm::isa<OperandX8632Mem>(Src))
-        Src = legalizeToVar(Src);
-    } else {
-      Src = legalizeToVar(Src);
-    }
-    _pand(T, Src);
-    if (isVectorType(Ty))
-      _movp(Dest, T);
-    else
-      _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::Longjmp: {
-    InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Memcpy: {
-    // In the future, we could potentially emit an inline memcpy/memset, etc.
-    // for intrinsic calls w/ a known length.
-    InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Memmove: {
-    InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Memset: {
-    // The value operand needs to be extended to a stack slot size
-    // because the PNaCl ABI requires arguments to be at least 32 bits
-    // wide.
-    Operand *ValOp = Instr->getArg(1);
-    assert(ValOp->getType() == IceType_i8);
-    Variable *ValExt = Func->makeVariable(stackSlotType());
-    lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
-    InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(ValExt);
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::NaClReadTP: {
-    if (Ctx->getFlags().getUseSandboxing()) {
-      Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      Operand *Src =
-          OperandX8632Mem::create(Func, IceType_i32, nullptr, Zero, nullptr, 0,
-                                  OperandX8632Mem::SegReg_GS);
-      Variable *Dest = Instr->getDest();
-      Variable *T = nullptr;
-      _mov(T, Src);
-      _mov(Dest, T);
-    } else {
-      InstCall *Call = makeHelperCall(H_call_read_tp, Instr->getDest(), 0);
-      lowerCall(Call);
-    }
-    return;
-  }
-  case Intrinsics::Setjmp: {
-    InstCall *Call = makeHelperCall(H_call_setjmp, Instr->getDest(), 1);
-    Call->addArg(Instr->getArg(0));
-    lowerCall(Call);
-    return;
-  }
-  case Intrinsics::Sqrt: {
-    Operand *Src = legalize(Instr->getArg(0));
-    Variable *Dest = Instr->getDest();
-    Variable *T = makeReg(Dest->getType());
-    _sqrtss(T, Src);
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::Stacksave: {
-    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
-    Variable *Dest = Instr->getDest();
-    _mov(Dest, esp);
-    return;
-  }
-  case Intrinsics::Stackrestore: {
-    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
-    _mov_nonkillable(esp, Instr->getArg(0));
-    return;
-  }
-  case Intrinsics::Trap:
-    _ud2();
-    return;
-  case Intrinsics::UnknownIntrinsic:
-    Func->setError("Should not be lowering UnknownIntrinsic");
-    return;
-  }
-  return;
-}
-
-void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
-                                     Operand *Expected, Operand *Desired) {
-  if (Expected->getType() == IceType_i64) {
-    // Reserve the pre-colored registers first, before adding any more
-    // infinite-weight variables from formMemoryOperand's legalization.
-    Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
-    Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
-    Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
-    Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
-    _mov(T_eax, loOperand(Expected));
-    _mov(T_edx, hiOperand(Expected));
-    _mov(T_ebx, loOperand(Desired));
-    _mov(T_ecx, hiOperand(Desired));
-    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
-    const bool Locked = true;
-    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
-    Variable *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
-    Variable *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
-    _mov(DestLo, T_eax);
-    _mov(DestHi, T_edx);
-    return;
-  }
-  Variable *T_eax = makeReg(Expected->getType(), RegX8632::Reg_eax);
-  _mov(T_eax, Expected);
-  OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
-  Variable *DesiredReg = legalizeToVar(Desired);
-  const bool Locked = true;
-  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
-  _mov(DestPrev, T_eax);
-}
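-
-// A sketch (hypothetical operands) of what the scalar path above produces:
-//   mov  eax, <expected>
-//   lock cmpxchg [<addr>], <desired_reg>
-//   mov  <dest_prev>, eax
-// cmpxchg compares eax against memory and stores the desired value on a
-// match; either way eax ends up holding the memory's previous value.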
-
-bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
-                                           Operand *Expected,
-                                           Operand *Desired) {
-  if (Ctx->getFlags().getOptLevel() == Opt_m1)
-    return false;
-  // Peek ahead a few instructions and see how Dest is used.
-  // It's very common to have:
-  //
-  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
-  // [%y_phi = ...] // list of phi stores
-  // %p = icmp eq i32 %x, %expected
-  // br i1 %p, label %l1, label %l2
-  //
-  // which we can optimize into:
-  //
-  // %x = <cmpxchg code>
-  // [%y_phi = ...] // list of phi stores
-  // br eq, %l1, %l2
-  InstList::iterator I = Context.getCur();
-  // I is currently the InstIntrinsicCall. Peek past that.
-  // This assumes that the atomic cmpxchg has not been lowered yet,
-  // so that the instructions seen in the scan from "Cur" are simple.
-  assert(llvm::isa<InstIntrinsicCall>(*I));
-  Inst *NextInst = Context.getNextInst(I);
-  if (!NextInst)
-    return false;
-  // There might be phi assignments right before the compare+branch, since this
-  // could be a backward branch for a loop. This placement of assignments is
-  // determined by placePhiStores().
-  std::vector<InstAssign *> PhiAssigns;
-  while (InstAssign *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
-    if (PhiAssign->getDest() == Dest)
-      return false;
-    PhiAssigns.push_back(PhiAssign);
-    NextInst = Context.getNextInst(I);
-    if (!NextInst)
-      return false;
-  }
-  if (InstIcmp *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
-    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
-          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
-           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
-      return false;
-    }
-    NextInst = Context.getNextInst(I);
-    if (!NextInst)
-      return false;
-    if (InstBr *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
-      if (!NextBr->isUnconditional() &&
-          NextCmp->getDest() == NextBr->getCondition() &&
-          NextBr->isLastUse(NextCmp->getDest())) {
-        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
-        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
-          // Lower the phi assignments now, before the branch (same placement
-          // as before).
-          InstAssign *PhiAssign = PhiAssigns[i];
-          PhiAssign->setDeleted();
-          lowerAssign(PhiAssign);
-          Context.advanceNext();
-        }
-        _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
-        // Skip over the old compare and branch, by deleting them.
-        NextCmp->setDeleted();
-        NextBr->setDeleted();
-        Context.advanceNext();
-        Context.advanceNext();
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
-                                 Operand *Ptr, Operand *Val) {
-  bool NeedsCmpxchg = false;
-  LowerBinOp Op_Lo = nullptr;
-  LowerBinOp Op_Hi = nullptr;
-  switch (Operation) {
-  default:
-    Func->setError("Unknown AtomicRMW operation");
-    return;
-  case Intrinsics::AtomicAdd: {
-    if (Dest->getType() == IceType_i64) {
-      // All the fall-through paths must set this to true; it is only
-      // used for asserting.
-      NeedsCmpxchg = true;
-      Op_Lo = &TargetX8632::_add;
-      Op_Hi = &TargetX8632::_adc;
-      break;
-    }
-    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    const bool Locked = true;
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _xadd(Addr, T, Locked);
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::AtomicSub: {
-    if (Dest->getType() == IceType_i64) {
-      NeedsCmpxchg = true;
-      Op_Lo = &TargetX8632::_sub;
-      Op_Hi = &TargetX8632::_sbb;
-      break;
-    }
-    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    const bool Locked = true;
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _neg(T);
-    _xadd(Addr, T, Locked);
-    _mov(Dest, T);
-    return;
-  }
-  case Intrinsics::AtomicOr:
-    // TODO(jvoung): If Dest is null or dead, then some of these
-    // operations do not need an "exchange", but just a locked op.
-    // That appears to be "worth" it for sub, or, and, and xor.
-    // xadd is probably fine vs lock add for add, and xchg is fine
-    // vs an atomic store.
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX8632::_or;
-    Op_Hi = &TargetX8632::_or;
-    break;
-  case Intrinsics::AtomicAnd:
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX8632::_and;
-    Op_Hi = &TargetX8632::_and;
-    break;
-  case Intrinsics::AtomicXor:
-    NeedsCmpxchg = true;
-    Op_Lo = &TargetX8632::_xor;
-    Op_Hi = &TargetX8632::_xor;
-    break;
-  case Intrinsics::AtomicExchange:
-    if (Dest->getType() == IceType_i64) {
-      NeedsCmpxchg = true;
-      // NeedsCmpxchg, but no actual Op_Lo/Op_Hi operations are needed.
-      // The values just need to be moved to the ecx and ebx registers.
-      Op_Lo = nullptr;
-      Op_Hi = nullptr;
-      break;
-    }
-    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
-    Variable *T = nullptr;
-    _mov(T, Val);
-    _xchg(Addr, T);
-    _mov(Dest, T);
-    return;
-  }
-  // Otherwise, we need a cmpxchg loop.
-  (void)NeedsCmpxchg;
-  assert(NeedsCmpxchg);
-  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
-}
-
-void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
-                                           Variable *Dest, Operand *Ptr,
-                                           Operand *Val) {
-  // Expand a more complex RMW operation as a cmpxchg loop:
-  // For 64-bit:
-  //   mov     eax, [ptr]
-  //   mov     edx, [ptr + 4]
-  // .LABEL:
-  //   mov     ebx, eax
-  //   <Op_Lo> ebx, <desired_adj_lo>
-  //   mov     ecx, edx
-  //   <Op_Hi> ecx, <desired_adj_hi>
-  //   lock cmpxchg8b [ptr]
-  //   jne     .LABEL
-  //   mov     <dest_lo>, eax
-  //   mov     <dest_hi>, edx
-  //
-  // For 32-bit:
-  //   mov     eax, [ptr]
-  // .LABEL:
-  //   mov     <reg>, eax
-  //   op      <reg>, [desired_adj]
-  //   lock cmpxchg [ptr], <reg>
-  //   jne     .LABEL
-  //   mov     <dest>, eax
-  //
-  // If Op_{Lo,Hi} are nullptr, then just copy the value.
-  Val = legalize(Val);
-  Type Ty = Val->getType();
-  if (Ty == IceType_i64) {
-    Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
-    Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
-    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
-    _mov(T_eax, loOperand(Addr));
-    _mov(T_edx, hiOperand(Addr));
-    Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
-    Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
-    InstX8632Label *Label = InstX8632Label::create(Func, this);
-    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
-    if (!IsXchg8b) {
-      Context.insert(Label);
-      _mov(T_ebx, T_eax);
-      (this->*Op_Lo)(T_ebx, loOperand(Val));
-      _mov(T_ecx, T_edx);
-      (this->*Op_Hi)(T_ecx, hiOperand(Val));
-    } else {
-      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
-      // It just needs the Val loaded into ebx and ecx.
-      // That can also be done before the loop.
-      _mov(T_ebx, loOperand(Val));
-      _mov(T_ecx, hiOperand(Val));
-      Context.insert(Label);
-    }
-    const bool Locked = true;
-    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
-    _br(CondX86::Br_ne, Label);
-    if (!IsXchg8b) {
-      // If Val is a variable, model the extended live range of Val through
-      // the end of the loop, since it will be re-used by the loop.
-      if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
-        Variable *ValLo = llvm::cast<Variable>(loOperand(ValVar));
-        Variable *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
-        Context.insert(InstFakeUse::create(Func, ValLo));
-        Context.insert(InstFakeUse::create(Func, ValHi));
-      }
-    } else {
-      // For xchg, the loop is slightly smaller and ebx/ecx are used.
-      Context.insert(InstFakeUse::create(Func, T_ebx));
-      Context.insert(InstFakeUse::create(Func, T_ecx));
-    }
-    // The address base (if any) is also reused in the loop.
-    if (Variable *Base = Addr->getBase())
-      Context.insert(InstFakeUse::create(Func, Base));
-    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    _mov(DestLo, T_eax);
-    _mov(DestHi, T_edx);
-    return;
-  }
-  OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
-  Variable *T_eax = makeReg(Ty, RegX8632::Reg_eax);
-  _mov(T_eax, Addr);
-  InstX8632Label *Label = InstX8632Label::create(Func, this);
-  Context.insert(Label);
-  // We want T to be in a register other than eax, so don't use
-  // _mov(T == nullptr, T_eax).
-  Variable *T = makeReg(Ty);
-  _mov(T, T_eax);
-  (this->*Op_Lo)(T, Val);
-  const bool Locked = true;
-  _cmpxchg(Addr, T_eax, T, Locked);
-  _br(CondX86::Br_ne, Label);
-  // If Val is a variable, model the extended live range of Val through
-  // the end of the loop, since it will be re-used by the loop.
-  if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
-    Context.insert(InstFakeUse::create(Func, ValVar));
-  }
-  // The address base (if any) is also reused in the loop.
-  if (Variable *Base = Addr->getBase())
-    Context.insert(InstFakeUse::create(Func, Base));
-  _mov(Dest, T_eax);
-}
-
-// Lowers the count {trailing, leading} zeros intrinsics.
-//
-// We could do constant folding here, but that should have
-// been done by the front-end/middle-end optimizations.
-void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
-                                  Operand *FirstVal, Operand *SecondVal) {
-  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
-  // Then the instructions will handle the Val == 0 case much more simply
-  // and won't require conversion from bit position to number of zeros.
-  //
-  // Otherwise:
-  //   bsr IF_NOT_ZERO, Val
-  //   mov T_DEST, 63
-  //   cmovne T_DEST, IF_NOT_ZERO
-  //   xor T_DEST, 31
-  //   mov DEST, T_DEST
-  //
-  // NOTE: T_DEST must be a register because cmov requires its dest to be a
-  // register. Also, bsf and bsr require their dest to be a register.
-  //
-  // The xor DEST, 31 converts a bit position to # of leading zeroes.
-  // E.g., for 000... 00001100, bsr will say that the most significant bit
-  // set is at position 3, while the number of leading zeros is 28. Xor is
-  // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
-  //
-  // Similar for 64-bit, but start w/ speculating that the upper 32 bits
-  // are all zero, and compute the result for that case (checking the lower
-  // 32 bits). Then actually compute the result for the upper bits and
-  // cmov in the result from the lower computation if the earlier speculation
-  // was correct.
-  //
-  // Cttz is similar, but uses bsf instead, doesn't require the xor
-  // bit position conversion, and the speculation is reversed.
-  assert(Ty == IceType_i32 || Ty == IceType_i64);
-  Variable *T = makeReg(IceType_i32);
-  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
-  if (Cttz) {
-    _bsf(T, FirstValRM);
-  } else {
-    _bsr(T, FirstValRM);
-  }
-  Variable *T_Dest = makeReg(IceType_i32);
-  Constant *ThirtyTwo = Ctx->getConstantInt32(32);
-  Constant *ThirtyOne = Ctx->getConstantInt32(31);
-  if (Cttz) {
-    _mov(T_Dest, ThirtyTwo);
-  } else {
-    Constant *SixtyThree = Ctx->getConstantInt32(63);
-    _mov(T_Dest, SixtyThree);
-  }
-  _cmov(T_Dest, T, CondX86::Br_ne);
-  if (!Cttz) {
-    _xor(T_Dest, ThirtyOne);
-  }
-  if (Ty == IceType_i32) {
-    _mov(Dest, T_Dest);
-    return;
-  }
-  _add(T_Dest, ThirtyTwo);
-  Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-  Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-  // Will be using "test" on this, so we need a registerized variable.
-  Variable *SecondVar = legalizeToVar(SecondVal);
-  Variable *T_Dest2 = makeReg(IceType_i32);
-  if (Cttz) {
-    _bsf(T_Dest2, SecondVar);
-  } else {
-    _bsr(T_Dest2, SecondVar);
-    _xor(T_Dest2, ThirtyOne);
-  }
-  _test(SecondVar, SecondVar);
-  _cmov(T_Dest2, T_Dest, CondX86::Br_e);
-  _mov(DestLo, T_Dest2);
-  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
-}
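-
-// A worked example of the 32-bit ctlz path above: for Val = 0x00001000, bsr
-// reports bit position 12 and 12 ^ 31 == 19, the number of leading zeros.
-// For Val == 0, the cmovne is not taken, T_DEST keeps 63, and 63 ^ 31 == 32.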
-
-namespace {
-
-bool isAdd(const Inst *Inst) {
-  if (const InstArithmetic *Arith =
-          llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
-    return (Arith->getOp() == InstArithmetic::Add);
-  }
-  return false;
-}
-
-void dumpAddressOpt(const Cfg *Func, const Variable *Base,
-                    const Variable *Index, uint16_t Shift, int32_t Offset,
-                    const Inst *Reason) {
-  if (!ALLOW_DUMP)
-    return;
-  if (!Func->isVerbose(IceV_AddrOpt))
-    return;
-  OstreamLocker L(Func->getContext());
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "Instruction: ";
-  Reason->dumpDecorated(Func);
-  Str << "  results in Base=";
-  if (Base)
-    Base->dump(Func);
-  else
-    Str << "<null>";
-  Str << ", Index=";
-  if (Index)
-    Index->dump(Func);
-  else
-    Str << "<null>";
-  Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n";
-}
-
-bool matchTransitiveAssign(const VariablesMetadata *VMetadata, Variable *&Var,
-                           const Inst *&Reason) {
-  // Var originates from Var=SrcVar ==>
-  //   set Var:=SrcVar
-  if (Var == nullptr)
-    return false;
-  if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) {
-    assert(!VMetadata->isMultiDef(Var));
-    if (llvm::isa<InstAssign>(VarAssign)) {
-      Operand *SrcOp = VarAssign->getSrc(0);
-      assert(SrcOp);
-      if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
-        if (!VMetadata->isMultiDef(SrcVar) &&
-            // TODO: ensure SrcVar stays single-BB
-            true) {
-          Var = SrcVar;
-          Reason = VarAssign;
-          return true;
-        }
-      }
-    }
-  }
-  return false;
-}
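-
-// E.g., if the address variable was defined only by 't = s', this rewrites
-// t to s, so the later matchers can look through the copy.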
-
-bool matchCombinedBaseIndex(const VariablesMetadata *VMetadata, Variable *&Base,
-                            Variable *&Index, uint16_t &Shift,
-                            const Inst *&Reason) {
-  // Index==nullptr && Base is Base=Var1+Var2 ==>
-  //   set Base=Var1, Index=Var2, Shift=0
-  if (Base == nullptr)
-    return false;
-  if (Index != nullptr)
-    return false;
-  const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
-  if (BaseInst == nullptr)
-    return false;
-  assert(!VMetadata->isMultiDef(Base));
-  if (BaseInst->getSrcSize() < 2)
-    return false;
-  if (Variable *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
-    if (VMetadata->isMultiDef(Var1))
-      return false;
-    if (Variable *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
-      if (VMetadata->isMultiDef(Var2))
-        return false;
-      if (isAdd(BaseInst) &&
-          // TODO: ensure Var1 and Var2 stay single-BB
-          true) {
-        Base = Var1;
-        Index = Var2;
-        Shift = 0; // should already have been 0
-        Reason = BaseInst;
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-bool matchShiftedIndex(const VariablesMetadata *VMetadata, Variable *&Index,
-                       uint16_t &Shift, const Inst *&Reason) {
-  // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
-  //   Index=Var, Shift+=log2(Const)
-  if (Index == nullptr)
-    return false;
-  const Inst *IndexInst = VMetadata->getSingleDefinition(Index);
-  if (IndexInst == nullptr)
-    return false;
-  assert(!VMetadata->isMultiDef(Index));
-  if (IndexInst->getSrcSize() < 2)
-    return false;
-  if (const InstArithmetic *ArithInst =
-          llvm::dyn_cast<InstArithmetic>(IndexInst)) {
-    if (Variable *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
-      if (ConstantInteger32 *Const =
-              llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
-        if (ArithInst->getOp() == InstArithmetic::Mul &&
-            !VMetadata->isMultiDef(Var) && Const->getType() == IceType_i32) {
-          uint64_t Mult = Const->getValue();
-          uint32_t LogMult;
-          switch (Mult) {
-          case 1:
-            LogMult = 0;
-            break;
-          case 2:
-            LogMult = 1;
-            break;
-          case 4:
-            LogMult = 2;
-            break;
-          case 8:
-            LogMult = 3;
-            break;
-          default:
-            return false;
-          }
-          if (Shift + LogMult <= 3) {
-            Index = Var;
-            Shift += LogMult;
-            Reason = IndexInst;
-            return true;
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
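-
-// E.g., an index defined by 'Index = Var * 8' with Shift still 0 is
-// rewritten to Index = Var, Shift = 3, which the addressing mode encodes
-// as a SIB scale of 2^3 == 8.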
-
-bool matchOffsetBase(const VariablesMetadata *VMetadata, Variable *&Base,
-                     int32_t &Offset, const Inst *&Reason) {
-  // Base is Base=Var+Const || Base is Base=Const+Var ==>
-  //   set Base=Var, Offset+=Const
-  // Base is Base=Var-Const ==>
-  //   set Base=Var, Offset-=Const
-  if (Base == nullptr)
-    return false;
-  const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
-  if (BaseInst == nullptr)
-    return false;
-  assert(!VMetadata->isMultiDef(Base));
-  if (const InstArithmetic *ArithInst =
-          llvm::dyn_cast<const InstArithmetic>(BaseInst)) {
-    if (ArithInst->getOp() != InstArithmetic::Add &&
-        ArithInst->getOp() != InstArithmetic::Sub)
-      return false;
-    bool IsAdd = ArithInst->getOp() == InstArithmetic::Add;
-    Variable *Var = nullptr;
-    ConstantInteger32 *Const = nullptr;
-    if (Variable *VariableOperand =
-            llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
-      Var = VariableOperand;
-      Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1));
-    } else if (IsAdd) {
-      Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(0));
-      Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(1));
-    }
-    if (Var == nullptr || Const == nullptr || VMetadata->isMultiDef(Var))
-      return false;
-    int32_t MoreOffset = IsAdd ? Const->getValue() : -Const->getValue();
-    if (Utils::WouldOverflowAdd(Offset, MoreOffset))
-      return false;
-    Base = Var;
-    Offset += MoreOffset;
-    Reason = BaseInst;
-    return true;
-  }
-  return false;
-}
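-
-// E.g., a base defined by 'Base = Var + 12' folds to Base = Var with
-// Offset += 12, provided the running Offset doesn't overflow.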
-
-void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
-                       Variable *&Index, uint16_t &Shift, int32_t &Offset) {
-  Func->resetCurrentNode();
-  if (Func->isVerbose(IceV_AddrOpt)) {
-    OstreamLocker L(Func->getContext());
-    Ostream &Str = Func->getContext()->getStrDump();
-    Str << "\nStarting computeAddressOpt for instruction:\n  ";
-    Instr->dumpDecorated(Func);
-  }
-  (void)Offset; // TODO: pattern-match for non-zero offsets.
-  if (Base == nullptr)
-    return;
-  // If the Base has more than one use or is live across multiple
-  // blocks, then don't go further.  Alternatively (?), never consider
-  // a transformation that would change a variable that is currently
-  // *not* live across basic block boundaries into one that *is*.
-  if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/)
-    return;
-
-  const VariablesMetadata *VMetadata = Func->getVMetadata();
-  bool Continue = true;
-  while (Continue) {
-    const Inst *Reason = nullptr;
-    if (matchTransitiveAssign(VMetadata, Base, Reason) ||
-        matchTransitiveAssign(VMetadata, Index, Reason) ||
-        matchCombinedBaseIndex(VMetadata, Base, Index, Shift, Reason) ||
-        matchShiftedIndex(VMetadata, Index, Shift, Reason) ||
-        matchOffsetBase(VMetadata, Base, Offset, Reason)) {
-      dumpAddressOpt(Func, Base, Index, Shift, Offset, Reason);
-    } else {
-      Continue = false;
-    }
-
-    // Index is Index=Var<<Const && Const+Shift<=3 ==>
-    //   Index=Var, Shift+=Const
-
-    // Index is Index=Const*Var && log2(Const)+Shift<=3 ==>
-    //   Index=Var, Shift+=log2(Const)
-
-    // Index && Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
-    //   swap(Index,Base)
-    // Similar for Base=Const*Var and Base=Var<<Const
-
-    // Index is Index=Var+Const ==>
-    //   set Index=Var, Offset+=(Const<<Shift)
-
-    // Index is Index=Const+Var ==>
-    //   set Index=Var, Offset+=(Const<<Shift)
-
-    // Index is Index=Var-Const ==>
-    //   set Index=Var, Offset-=(Const<<Shift)
-
-    // TODO: consider overflow issues with respect to Offset.
-    // TODO: handle symbolic constants.
-  }
-}
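-
-// A sketch of the overall effect (variable names hypothetical): given
-//   t1 = a + 12
-//   t2 = i * 4
-//   t3 = t1 + t2
-// an address based on t3 is iteratively folded into Base=a, Index=i,
-// Shift=2, Offset=12, i.e. the x86 addressing form 12(%a,%i,4),
-// provided each matched variable has a single definition.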
-
-} // anonymous namespace
-
-void TargetX8632::lowerLoad(const InstLoad *Load) {
-  // A Load instruction can be treated the same as an Assign
-  // instruction, after the source operand is transformed into an
-  // OperandX8632Mem operand.  Note that the address mode
-  // optimization already creates an OperandX8632Mem operand, so it
-  // doesn't need another level of transformation.
-  Variable *DestLoad = Load->getDest();
-  Type Ty = DestLoad->getType();
-  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
-  InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
-  lowerAssign(Assign);
-}
-
-void TargetX8632::doAddressOptLoad() {
-  Inst *Inst = Context.getCur();
-  Variable *Dest = Inst->getDest();
-  Operand *Addr = Inst->getSrc(0);
-  Variable *Index = nullptr;
-  uint16_t Shift = 0;
-  int32_t Offset = 0; // TODO: make Constant
-  // Vanilla ICE load instructions should not use the segment registers,
-  // and computeAddressOpt only works at the level of Variables and Constants,
-  // not other OperandX8632Mem, so there should be no mention of segment
-  // registers there either.
-  const OperandX8632Mem::SegmentRegisters SegmentReg =
-      OperandX8632Mem::DefaultSegment;
-  Variable *Base = llvm::dyn_cast<Variable>(Addr);
-  computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
-  if (Base && Addr != Base) {
-    Inst->setDeleted();
-    Constant *OffsetOp = Ctx->getConstantInt32(Offset);
-    Addr = OperandX8632Mem::create(Func, Dest->getType(), Base, OffsetOp, Index,
-                                   Shift, SegmentReg);
-    Context.insert(InstLoad::create(Func, Dest, Addr));
-  }
-}
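-
-// Illustrative before/after for the rewrite above (names hypothetical):
-//   %dest = load %t3
-// becomes, when computeAddressOpt finds a better mode,
-//   %dest = load OperandX8632Mem(Base, Offset, Index, Shift)
-// and the original load is marked deleted.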
-
-void TargetX8632::randomlyInsertNop(float Probability) {
-  RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
-  if (RNG.getTrueWithProbability(Probability)) {
-    _nop(RNG(X86_NUM_NOP_VARIANTS));
-  }
-}
-
-void TargetX8632::lowerPhi(const InstPhi * /*Inst*/) {
-  Func->setError("Phi found in regular instruction list");
-}
-
-void TargetX8632::lowerRet(const InstRet *Inst) {
-  Variable *Reg = nullptr;
-  if (Inst->hasRetValue()) {
-    Operand *Src0 = legalize(Inst->getRetValue());
-    if (Src0->getType() == IceType_i64) {
-      Variable *eax = legalizeToVar(loOperand(Src0), RegX8632::Reg_eax);
-      Variable *edx = legalizeToVar(hiOperand(Src0), RegX8632::Reg_edx);
-      Reg = eax;
-      Context.insert(InstFakeUse::create(Func, edx));
-    } else if (isScalarFloatingType(Src0->getType())) {
-      _fld(Src0);
-    } else if (isVectorType(Src0->getType())) {
-      Reg = legalizeToVar(Src0, RegX8632::Reg_xmm0);
-    } else {
-      _mov(Reg, Src0, RegX8632::Reg_eax);
-    }
-  }
-  // Add a ret instruction even if sandboxing is enabled, because
-  // addEpilog explicitly looks for a ret instruction as a marker for
-  // where to insert the frame removal instructions.
-  _ret(Reg);
-  // Add a fake use of esp to make sure esp stays alive for the entire
-  // function.  Otherwise post-call esp adjustments get dead-code
-  // eliminated.  TODO: Are there more places where the fake use
-  // should be inserted?  E.g. "void f(int n){while(1) g(n);}" may not
-  // have a ret instruction.
-  Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
-  Context.insert(InstFakeUse::create(Func, esp));
-}
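-
-// E.g. (illustrative), returning an i64 value roughly produces
-//   mov eax, <lo32>
-//   mov edx, <hi32>
-//   ret
-// with a FakeUse of edx so the assignment to edx isn't dead-code
-// eliminated, plus the FakeUse of esp described above.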
-
-void TargetX8632::lowerSelect(const InstSelect *Inst) {
-  Variable *Dest = Inst->getDest();
-  Type DestTy = Dest->getType();
-  Operand *SrcT = Inst->getTrueOperand();
-  Operand *SrcF = Inst->getFalseOperand();
-  Operand *Condition = Inst->getCondition();
-
-  if (isVectorType(DestTy)) {
-    Type SrcTy = SrcT->getType();
-    Variable *T = makeReg(SrcTy);
-    Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
-    Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
-    if (InstructionSet >= SSE4_1) {
-      // TODO(wala): If the condition operand is a constant, use blendps
-      // or pblendw.
-      //
-      // Use blendvps or pblendvb to implement select.
-      if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
-          SrcTy == IceType_v4f32) {
-        Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
-        Variable *xmm0 = makeReg(IceType_v4i32, RegX8632::Reg_xmm0);
-        _movp(xmm0, ConditionRM);
-        _psll(xmm0, Ctx->getConstantInt8(31));
-        _movp(T, SrcFRM);
-        _blendvps(T, SrcTRM, xmm0);
-        _movp(Dest, T);
-      } else {
-        assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
-        Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
-                                                              : IceType_v16i8;
-        Variable *xmm0 = makeReg(SignExtTy, RegX8632::Reg_xmm0);
-        lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
-        _movp(T, SrcFRM);
-        _pblendvb(T, SrcTRM, xmm0);
-        _movp(Dest, T);
-      }
-      return;
-    }
-    // Lower select without SSE4.1:
-    // a=d?b:c ==>
-    //   if elementtype(d) != i1:
-    //      d=sext(d);
-    //   a=(b&d)|(c&~d);
-    Variable *T2 = makeReg(SrcTy);
-    // Sign extend the condition operand if applicable.
-    if (SrcTy == IceType_v4f32) {
-      // The sext operation takes only integer arguments.
-      Variable *T3 = Func->makeVariable(IceType_v4i32);
-      lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
-      _movp(T, T3);
-    } else if (typeElementType(SrcTy) != IceType_i1) {
-      lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
-    } else {
-      Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
-      _movp(T, ConditionRM);
-    }
-    _movp(T2, T);
-    _pand(T, SrcTRM);
-    _pandn(T2, SrcFRM);
-    _por(T, T2);
-    _movp(Dest, T);
-
-    return;
-  }
-
-  CondX86::BrCond Cond = CondX86::Br_ne;
-  Operand *CmpOpnd0 = nullptr;
-  Operand *CmpOpnd1 = nullptr;
-  // Handle folding opportunities.
-  if (const class Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
-    assert(Producer->isDeleted());
-    switch (BoolFolding::getProducerKind(Producer)) {
-    default:
-      break;
-    case BoolFolding::PK_Icmp32: {
-      auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
-      Cond = getIcmp32Mapping(Cmp->getCondition());
-      CmpOpnd1 = legalize(Producer->getSrc(1));
-      CmpOpnd0 = legalizeSrc0ForCmp(Producer->getSrc(0), CmpOpnd1);
-    } break;
-    }
-  }
-  if (CmpOpnd0 == nullptr) {
-    CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem);
-    CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
-  }
-  assert(CmpOpnd0);
-  assert(CmpOpnd1);
-
-  _cmp(CmpOpnd0, CmpOpnd1);
-  if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
-    // The cmov instruction doesn't allow 8-bit or FP operands, so
-    // we need explicit control flow.
-    // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
-    InstX8632Label *Label = InstX8632Label::create(Func, this);
-    SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
-    _mov(Dest, SrcT);
-    _br(Cond, Label);
-    SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
-    _mov_nonkillable(Dest, SrcF);
-    Context.insert(Label);
-    return;
-  }
-  // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
-  // But if SrcT is immediate, we might be able to do better, as
-  // the cmov instruction doesn't allow an immediate operand:
-  // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
-  if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
-    std::swap(SrcT, SrcF);
-    Cond = InstX8632::getOppositeCondition(Cond);
-  }
-  if (DestTy == IceType_i64) {
-    // Set the low portion.
-    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-    Variable *TLo = nullptr;
-    Operand *SrcFLo = legalize(loOperand(SrcF));
-    _mov(TLo, SrcFLo);
-    Operand *SrcTLo = legalize(loOperand(SrcT), Legal_Reg | Legal_Mem);
-    _cmov(TLo, SrcTLo, Cond);
-    _mov(DestLo, TLo);
-    // Set the high portion.
-    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-    Variable *THi = nullptr;
-    Operand *SrcFHi = legalize(hiOperand(SrcF));
-    _mov(THi, SrcFHi);
-    Operand *SrcTHi = legalize(hiOperand(SrcT), Legal_Reg | Legal_Mem);
-    _cmov(THi, SrcTHi, Cond);
-    _mov(DestHi, THi);
-    return;
-  }
-
-  assert(DestTy == IceType_i16 || DestTy == IceType_i32);
-  Variable *T = nullptr;
-  SrcF = legalize(SrcF);
-  _mov(T, SrcF);
-  SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
-  _cmov(T, SrcT, Cond);
-  _mov(Dest, T);
-}
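-
-// As a concrete (illustrative) instance of the scalar path above,
-// "a = select i1 %cond, i32 %b, i32 %c" lowers roughly to:
-//   cmp <cond>, 0
-//   mov t, <c>
-//   cmovne t, <b>
-//   mov a, t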
-
-void TargetX8632::lowerStore(const InstStore *Inst) {
-  Operand *Value = Inst->getData();
-  Operand *Addr = Inst->getAddr();
-  OperandX8632Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
-  Type Ty = NewAddr->getType();
-
-  if (Ty == IceType_i64) {
-    Value = legalize(Value);
-    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
-    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
-    _store(ValueHi, llvm::cast<OperandX8632Mem>(hiOperand(NewAddr)));
-    _store(ValueLo, llvm::cast<OperandX8632Mem>(loOperand(NewAddr)));
-  } else if (isVectorType(Ty)) {
-    _storep(legalizeToVar(Value), NewAddr);
-  } else {
-    Value = legalize(Value, Legal_Reg | Legal_Imm);
-    _store(Value, NewAddr);
-  }
-}
-
-void TargetX8632::doAddressOptStore() {
-  InstStore *Inst = llvm::cast<InstStore>(Context.getCur());
-  Operand *Data = Inst->getData();
-  Operand *Addr = Inst->getAddr();
-  Variable *Index = nullptr;
-  uint16_t Shift = 0;
-  int32_t Offset = 0; // TODO: make Constant
-  Variable *Base = llvm::dyn_cast<Variable>(Addr);
-  // Vanilla ICE store instructions should not use the segment registers,
-  // and computeAddressOpt only works at the level of Variables and Constants,
-  // not other OperandX8632Mem, so there should be no mention of segment
-  // registers there either.
-  const OperandX8632Mem::SegmentRegisters SegmentReg =
-      OperandX8632Mem::DefaultSegment;
-  computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
-  if (Base && Addr != Base) {
-    Inst->setDeleted();
-    Constant *OffsetOp = Ctx->getConstantInt32(Offset);
-    Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
-                                   Shift, SegmentReg);
-    InstStore *NewStore = InstStore::create(Func, Data, Addr);
-    if (Inst->getDest())
-      NewStore->setRmwBeacon(Inst->getRmwBeacon());
-    Context.insert(NewStore);
-  }
-}
-
-void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
-  // This implements the most naive possible lowering.
-  // cmp a,val[0]; jeq label[0]; cmp a,val[1]; jeq label[1]; ... jmp default
-  Operand *Src0 = Inst->getComparison();
-  SizeT NumCases = Inst->getNumCases();
-  if (Src0->getType() == IceType_i64) {
-    Src0 = legalize(Src0); // get Base/Index into physical registers
-    Operand *Src0Lo = loOperand(Src0);
-    Operand *Src0Hi = hiOperand(Src0);
-    if (NumCases >= 2) {
-      Src0Lo = legalizeToVar(Src0Lo);
-      Src0Hi = legalizeToVar(Src0Hi);
-    } else {
-      Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
-      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
-    }
-    for (SizeT I = 0; I < NumCases; ++I) {
-      Constant *ValueLo = Ctx->getConstantInt32(Inst->getValue(I));
-      Constant *ValueHi = Ctx->getConstantInt32(Inst->getValue(I) >> 32);
-      InstX8632Label *Label = InstX8632Label::create(Func, this);
-      _cmp(Src0Lo, ValueLo);
-      _br(CondX86::Br_ne, Label);
-      _cmp(Src0Hi, ValueHi);
-      _br(CondX86::Br_e, Inst->getLabel(I));
-      Context.insert(Label);
-    }
-    _br(Inst->getLabelDefault());
-    return;
-  }
-  // OK, we'll be slightly less naive by forcing Src into a physical
-  // register if there are 2 or more uses.
-  if (NumCases >= 2)
-    Src0 = legalizeToVar(Src0);
-  else
-    Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
-  for (SizeT I = 0; I < NumCases; ++I) {
-    Constant *Value = Ctx->getConstantInt32(Inst->getValue(I));
-    _cmp(Src0, Value);
-    _br(CondX86::Br_e, Inst->getLabel(I));
-  }
-
-  _br(Inst->getLabelDefault());
-}
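-
-// E.g. (illustrative), "switch i32 %x [1 -> L1, 2 -> L2] default Ld"
-// lowers roughly to:
-//   cmp <x>, 1
-//   je L1
-//   cmp <x>, 2
-//   je L2
-//   jmp Ld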
-
-void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
-                                      Variable *Dest, Operand *Src0,
-                                      Operand *Src1) {
-  assert(isVectorType(Dest->getType()));
-  Type Ty = Dest->getType();
-  Type ElementTy = typeElementType(Ty);
-  SizeT NumElements = typeNumElements(Ty);
-
-  Operand *T = Ctx->getConstantUndef(Ty);
-  for (SizeT I = 0; I < NumElements; ++I) {
-    Constant *Index = Ctx->getConstantInt32(I);
-
-    // Extract the next two inputs.
-    Variable *Op0 = Func->makeVariable(ElementTy);
-    lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
-    Variable *Op1 = Func->makeVariable(ElementTy);
-    lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
-
-    // Perform the arithmetic as a scalar operation.
-    Variable *Res = Func->makeVariable(ElementTy);
-    lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
-
-    // Insert the result into position.
-    Variable *DestT = Func->makeVariable(Ty);
-    lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
-    T = DestT;
-  }
-
-  lowerAssign(InstAssign::create(Func, Dest, T));
-}
-
-// The following pattern occurs often in lowered C and C++ code:
-//
-//   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
-//   %cmp.ext = sext <n x i1> %cmp to <n x ty>
-//
-// We can eliminate the sext operation by copying the result of pcmpeqd,
-// pcmpgtd, or cmpps (which produce sign extended results) to the result
-// of the sext operation.
-void TargetX8632::eliminateNextVectorSextInstruction(
-    Variable *SignExtendedResult) {
-  if (InstCast *NextCast =
-          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
-    if (NextCast->getCastKind() == InstCast::Sext &&
-        NextCast->getSrc(0) == SignExtendedResult) {
-      NextCast->setDeleted();
-      _movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
-      // Skip over the instruction.
-      Context.advanceNext();
-    }
-  }
-}
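-
-// Illustrative: for
-//   %cmp = icmp sgt <4 x i32> %a, %b   ; lowered with pcmpgtd
-//   %ext = sext <4 x i1> %cmp to <4 x i32>
-// pcmpgtd already leaves each lane as all-ones or all-zeros, so the
-// sext is deleted and replaced by a movp copy of the compare result.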
-
-void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) { _ud2(); }
-
-void TargetX8632::lowerRMW(const InstX8632FakeRMW *RMW) {
-  // If the beacon variable's live range does not end in this
-  // instruction, then it must end in the modified Store instruction
-  // that follows.  This means that the original Store instruction is
-  // still there, either because the value being stored is used beyond
-  // the Store instruction, or because dead code elimination did not
-  // happen.  In either case, we cancel RMW lowering (and the caller
-  // deletes the RMW instruction).
-  if (!RMW->isLastUse(RMW->getBeacon()))
-    return;
-  Operand *Src = RMW->getData();
-  Type Ty = Src->getType();
-  OperandX8632Mem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
-  if (Ty == IceType_i64) {
-    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
-    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
-    OperandX8632Mem *AddrLo = llvm::cast<OperandX8632Mem>(loOperand(Addr));
-    OperandX8632Mem *AddrHi = llvm::cast<OperandX8632Mem>(hiOperand(Addr));
-    switch (RMW->getOp()) {
-    default:
-      // TODO(stichnot): Implement other arithmetic operators.
-      break;
-    case InstArithmetic::Add:
-      _add_rmw(AddrLo, SrcLo);
-      _adc_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Sub:
-      _sub_rmw(AddrLo, SrcLo);
-      _sbb_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::And:
-      _and_rmw(AddrLo, SrcLo);
-      _and_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Or:
-      _or_rmw(AddrLo, SrcLo);
-      _or_rmw(AddrHi, SrcHi);
-      return;
-    case InstArithmetic::Xor:
-      _xor_rmw(AddrLo, SrcLo);
-      _xor_rmw(AddrHi, SrcHi);
-      return;
-    }
-  } else {
-    // i8, i16, i32
-    switch (RMW->getOp()) {
-    default:
-      // TODO(stichnot): Implement other arithmetic operators.
-      break;
-    case InstArithmetic::Add:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _add_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Sub:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _sub_rmw(Addr, Src);
-      return;
-    case InstArithmetic::And:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _and_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Or:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _or_rmw(Addr, Src);
-      return;
-    case InstArithmetic::Xor:
-      Src = legalize(Src, Legal_Reg | Legal_Imm);
-      _xor_rmw(Addr, Src);
-      return;
-    }
-  }
-  llvm::report_fatal_error("Couldn't lower RMW instruction");
-}
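-
-// Illustrative RMW pattern (names hypothetical): the sequence
-//   %v = load %p
-//   %v2 = add %v, %x
-//   store %v2, %p
-// becomes, when %v2 has no other uses, a single memory-destination
-// instruction such as "add [%p], %x" (split into add/adc halves for
-// i64, as above).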
-
-void TargetX8632::lowerOther(const Inst *Instr) {
-  if (const auto *RMW = llvm::dyn_cast<InstX8632FakeRMW>(Instr)) {
-    lowerRMW(RMW);
-  } else {
-    TargetLowering::lowerOther(Instr);
-  }
-}
-
-// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
-// preserve integrity of liveness analysis.  Undef values are also
-// turned into zeroes, since loOperand() and hiOperand() don't expect
-// Undef input.
-void TargetX8632::prelowerPhis() {
-  // Pause constant blinding or pooling; blinding or pooling will be done
-  // later, during phi lowering assignments.
-  BoolFlagSaver B(RandomizationPoolingPaused, true);
-
-  CfgNode *Node = Context.getNode();
-  for (Inst &I : Node->getPhis()) {
-    auto Phi = llvm::dyn_cast<InstPhi>(&I);
-    if (Phi->isDeleted())
-      continue;
-    Variable *Dest = Phi->getDest();
-    if (Dest->getType() == IceType_i64) {
-      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
-      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
-      InstPhi *PhiLo = InstPhi::create(Func, Phi->getSrcSize(), DestLo);
-      InstPhi *PhiHi = InstPhi::create(Func, Phi->getSrcSize(), DestHi);
-      for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
-        Operand *Src = Phi->getSrc(I);
-        CfgNode *Label = Phi->getLabel(I);
-        if (llvm::isa<ConstantUndef>(Src))
-          Src = Ctx->getConstantZero(Dest->getType());
-        PhiLo->addArgument(loOperand(Src), Label);
-        PhiHi->addArgument(hiOperand(Src), Label);
-      }
-      Node->getPhis().push_back(PhiLo);
-      Node->getPhis().push_back(PhiHi);
-      Phi->setDeleted();
-    }
-  }
-}
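-
-// E.g. (illustrative), the phi
-//   %x = phi i64 [ %a, %B1 ], [ undef, %B2 ]
-// is rewritten as
-//   %x.lo = phi i32 [ %a.lo, %B1 ], [ 0, %B2 ]
-//   %x.hi = phi i32 [ %a.hi, %B1 ], [ 0, %B2 ]
-// with the original i64 phi marked deleted.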
-
-namespace {
-
-bool isMemoryOperand(const Operand *Opnd) {
-  if (const auto Var = llvm::dyn_cast<Variable>(Opnd))
-    return !Var->hasReg();
-  // We treat vector undef values the same as a memory operand,
-  // because they do in fact need a register to materialize the vector
-  // of zeroes into.
-  if (llvm::isa<ConstantUndef>(Opnd))
-    return isScalarFloatingType(Opnd->getType()) ||
-           isVectorType(Opnd->getType());
-  if (llvm::isa<Constant>(Opnd))
-    return isScalarFloatingType(Opnd->getType());
-  return true;
-}
-
-} // end of anonymous namespace
-
-// Lower the pre-ordered list of assignments into mov instructions.
-// Also has to do some ad-hoc register allocation as necessary.
-void TargetX8632::lowerPhiAssignments(CfgNode *Node,
-                                      const AssignList &Assignments) {
-  // Check that this is a properly initialized shell of a node.
-  assert(Node->getOutEdges().size() == 1);
-  assert(Node->getInsts().empty());
-  assert(Node->getPhis().empty());
-  CfgNode *Succ = Node->getOutEdges().front();
-  getContext().init(Node);
-  // Register set setup similar to regAlloc().
-  RegSetMask RegInclude = RegSet_All;
-  RegSetMask RegExclude = RegSet_StackPointer;
-  if (hasFramePointer())
-    RegExclude |= RegSet_FramePointer;
-  llvm::SmallBitVector Available = getRegisterSet(RegInclude, RegExclude);
-  bool NeedsRegs = false;
-  // Initialize the set of available registers to the set of what is
-  // available (not live) at the beginning of the successor block,
-  // minus all registers used as Dest operands in the Assignments.  To
-  // do this, we start off assuming all registers are available, then
-  // iterate through the Assignments and remove Dest registers.
-  // During this iteration, we also determine whether we will actually
-  // need any extra registers for memory-to-memory copies.  If so, we
-  // do the actual work of removing the live-in registers from the
-  // set.  TODO(stichnot): This work is being repeated for every split
-  // edge to the successor, so consider updating LiveIn just once
-  // after all the edges are split.
-  for (const Inst &I : Assignments) {
-    Variable *Dest = I.getDest();
-    if (Dest->hasReg()) {
-      Available[Dest->getRegNum()] = false;
-    } else if (isMemoryOperand(I.getSrc(0))) {
-      NeedsRegs = true; // Src and Dest are both in memory
-    }
-  }
-  if (NeedsRegs) {
-    LivenessBV &LiveIn = Func->getLiveness()->getLiveIn(Succ);
-    for (int i = LiveIn.find_first(); i != -1; i = LiveIn.find_next(i)) {
-      Variable *Var = Func->getLiveness()->getVariable(i, Succ);
-      if (Var->hasReg())
-        Available[Var->getRegNum()] = false;
-    }
-  }
-  // Iterate backwards through the Assignments.  After lowering each
-  // assignment, add Dest to the set of available registers, and
-  // remove Src from the set of available registers.  Iteration is
-  // done backwards to enable incremental updates of the available
-  // register set, and the lowered instruction numbers may be out of
-  // order, but that can be worked around by renumbering the block
-  // afterwards if necessary.
-  for (const Inst &I : reverse_range(Assignments)) {
-    Context.rewind();
-    auto Assign = llvm::dyn_cast<InstAssign>(&I);
-    Variable *Dest = Assign->getDest();
-
-    // If the source operand is ConstantUndef, do not legalize it.  In
-    // function test_split_undef_int_vec, the advanced phi lowering
-    // process finds an assignment of an undefined vector.  If that
-    // vector, as the Src here, went through legalize(), legalize()
-    // would create a new variable with makeVectorOfZeros(), but the
-    // new variable would be assigned a stack slot.  That would fail
-    // the assertion in IceInstX8632.cpp:789, where XmmEmitterRegOp()
-    // complains that Var->hasReg() is false.  Note that this failure
-    // is unrelated to randomization or pooling of constants.
-    // So, we do not call legalize() to add a pool label for the src
-    // operands of phi assignment instructions.  Instead, we manually
-    // add a pool label for constant float and constant double values
-    // here.  Note that going through legalize() does not affect the
-    // test results of SPEC2K and xtests.
-    Operand *Src = Assign->getSrc(0);
-    if (!llvm::isa<ConstantUndef>(Assign->getSrc(0))) {
-      Src = legalize(Src);
-    }
-
-    Variable *SrcVar = llvm::dyn_cast<Variable>(Src);
-    // Use normal assignment lowering, except lower mem=mem specially
-    // so we can register-allocate at the same time.
-    if (!isMemoryOperand(Dest) || !isMemoryOperand(Src)) {
-      lowerAssign(Assign);
-    } else {
-      assert(Dest->getType() == Src->getType());
-      const llvm::SmallBitVector &RegsForType =
-          getRegisterSetForType(Dest->getType());
-      llvm::SmallBitVector AvailRegsForType = RegsForType & Available;
-      Variable *SpillLoc = nullptr;
-      Variable *Preg = nullptr;
-      // TODO(stichnot): Opportunity for register randomization.
-      int32_t RegNum = AvailRegsForType.find_first();
-      bool IsVector = isVectorType(Dest->getType());
-      bool NeedSpill = (RegNum == -1);
-      if (NeedSpill) {
-        // Pick some register to spill and update RegNum.
-        // TODO(stichnot): Opportunity for register randomization.
-        RegNum = RegsForType.find_first();
-        Preg = getPhysicalRegister(RegNum, Dest->getType());
-        SpillLoc = Func->makeVariable(Dest->getType());
-        // Create a fake def of the physical register to avoid
-        // liveness inconsistency problems during late-stage liveness
-        // analysis (e.g. asm-verbose mode).
-        Context.insert(InstFakeDef::create(Func, Preg));
-        if (IsVector)
-          _movp(SpillLoc, Preg);
-        else
-          _mov(SpillLoc, Preg);
-      }
-      assert(RegNum >= 0);
-      if (llvm::isa<ConstantUndef>(Src))
-        // Materialize an actual constant instead of undef.  RegNum is
-        // passed in for vector types because undef vectors are lowered
-        // to a vector register of zeroes.
-        Src =
-            legalize(Src, Legal_All, IsVector ? RegNum : Variable::NoRegister);
-      Variable *Tmp = makeReg(Dest->getType(), RegNum);
-      if (IsVector) {
-        _movp(Tmp, Src);
-        _movp(Dest, Tmp);
-      } else {
-        _mov(Tmp, Src);
-        _mov(Dest, Tmp);
-      }
-      if (NeedSpill) {
-        // Restore the spilled register.
-        if (IsVector)
-          _movp(Preg, SpillLoc);
-        else
-          _mov(Preg, SpillLoc);
-        // Create a fake use of the physical register to keep it live
-        // for late-stage liveness analysis (e.g. asm-verbose mode).
-        Context.insert(InstFakeUse::create(Func, Preg));
-      }
-    }
-    // Update register availability before moving to the previous
-    // instruction on the Assignments list.
-    if (Dest->hasReg())
-      Available[Dest->getRegNum()] = true;
-    if (SrcVar && SrcVar->hasReg())
-      Available[SrcVar->getRegNum()] = false;
-  }
-
-  // Add the terminator branch instruction to the end.
-  Context.setInsertPoint(Context.getEnd());
-  _br(Succ);
-}
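-
-// E.g. (illustrative), a memory-to-memory phi assignment with no free
-// register is lowered roughly as:
-//   mov spillLoc, reg   ; save some register
-//   mov reg, src
-//   mov dest, reg
-//   mov reg, spillLoc   ; restore it
-// bracketed by a FakeDef and FakeUse of the register for liveness.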
-
-// There is no support for loading or emitting vector constants, so the
-// vector values returned from makeVectorOfZeros, makeVectorOfOnes,
-// etc. are initialized with register operations.
-//
-// TODO(wala): Add limited support for vector constants so that
-// complex initialization in registers is unnecessary.
-
-Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
-  Variable *Reg = makeReg(Ty, RegNum);
-  // Insert a FakeDef, since otherwise the live range of Reg might
-  // be overestimated.
-  Context.insert(InstFakeDef::create(Func, Reg));
-  _pxor(Reg, Reg);
-  return Reg;
-}
-
-Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, int32_t RegNum) {
-  Variable *MinusOnes = makeReg(Ty, RegNum);
-  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
-  Context.insert(InstFakeDef::create(Func, MinusOnes));
-  _pcmpeq(MinusOnes, MinusOnes);
-  return MinusOnes;
-}
-
-Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) {
-  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
-  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
-  _psub(Dest, MinusOne);
-  return Dest;
-}
-
-Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
-  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
-         Ty == IceType_v16i8);
-  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
-    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
-    SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
-    _psll(Reg, Ctx->getConstantInt8(Shift));
-    return Reg;
-  } else {
-    // SSE has no left shift operation for vectors of 8 bit integers.
-    const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
-    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
-    Variable *Reg = makeReg(Ty, RegNum);
-    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
-    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
-    return Reg;
-  }
-}
-
-// Construct a mask in a register that can be and'ed with a
-// floating-point value to mask off its sign bit.  The value will be
-// <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff>
-// for f64.  Construct it as a vector of ones logically right shifted
-// one bit.  TODO(stichnot): Fix the wala TODO above, to represent
-// vector constants in memory.
-Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, int32_t RegNum) {
-  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
-  _psrl(Reg, Ctx->getConstantInt8(1));
-  return Reg;
-}
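-
-// The register-only idioms used above (illustrative):
-//   pxor   reg, reg    ; <0, 0, ...>
-//   pcmpeq reg, reg    ; all ones bits, i.e. <-1, -1, ...>
-//   psub   zero, m1    ; 0 - (-1), i.e. <1, 1, ...>
-//   psrl   m1, 1       ; fabs mask, e.g. <0x7fffffff, ...> for v4f32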
-
-OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
-                                                           Variable *Slot,
-                                                           uint32_t Offset) {
-  // Ensure that Loc is a stack slot.
-  assert(Slot->getWeight().isZero());
-  assert(Slot->getRegNum() == Variable::NoRegister);
-  // Compute the location of Loc in memory.
-  // TODO(wala,stichnot): lea should not be required.  The address of
-  // the stack slot is known at compile time (although not until after
-  // addProlog()).
-  const Type PointerType = IceType_i32;
-  Variable *Loc = makeReg(PointerType);
-  _lea(Loc, Slot);
-  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
-  return OperandX8632Mem::create(Func, Ty, Loc, ConstantOffset);
-}
-
-// Helper for legalize() to emit the right code to lower an operand to a
-// register of the appropriate type.
-Variable *TargetX8632::copyToReg(Operand *Src, int32_t RegNum) {
-  Type Ty = Src->getType();
-  Variable *Reg = makeReg(Ty, RegNum);
-  if (isVectorType(Ty)) {
-    _movp(Reg, Src);
-  } else {
-    _mov(Reg, Src);
-  }
-  return Reg;
-}
-
-Operand *TargetX8632::legalize(Operand *From, LegalMask Allowed,
-                               int32_t RegNum) {
-  Type Ty = From->getType();
-  // Assert that a physical register is allowed.  To date, all calls
-  // to legalize() allow a physical register.  If a physical register
-  // needs to be explicitly disallowed, then new code will need to be
-  // written to force a spill.
-  assert(Allowed & Legal_Reg);
-  // If we're asking for a specific physical register, make sure we're
-  // not allowing any other operand kinds.  (This could be future
-  // work, e.g. allow the shl shift amount to be either an immediate
-  // or in ecx.)
-  assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg);
-
-  if (auto Mem = llvm::dyn_cast<OperandX8632Mem>(From)) {
-    // Before doing anything with a Mem operand, we need to ensure
-    // that the Base and Index components are in physical registers.
-    Variable *Base = Mem->getBase();
-    Variable *Index = Mem->getIndex();
-    Variable *RegBase = nullptr;
-    Variable *RegIndex = nullptr;
-    if (Base) {
-      RegBase = legalizeToVar(Base);
-    }
-    if (Index) {
-      RegIndex = legalizeToVar(Index);
-    }
-    if (Base != RegBase || Index != RegIndex) {
-      Mem =
-          OperandX8632Mem::create(Func, Ty, RegBase, Mem->getOffset(), RegIndex,
-                                  Mem->getShift(), Mem->getSegmentRegister());
-    }
-
-    // For all Memory Operands, we do randomization/pooling here
-    From = randomizeOrPoolImmediate(Mem);
-
-    if (!(Allowed & Legal_Mem)) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
-    if (llvm::isa<ConstantUndef>(Const)) {
-      // Lower undefs to zero.  Another option is to lower undefs to an
-      // uninitialized register; however, using an uninitialized register
-      // results in less predictable code.
-      //
-      // If in the future the implementation is changed to lower undef
-      // values to uninitialized registers, a FakeDef will be needed:
-      //     Context.insert(InstFakeDef::create(Func, Reg));
-      // This is in order to ensure that the live range of Reg is not
-      // overestimated.  If the constant being lowered is a 64 bit value,
-      // then the result should be split and the lo and hi components will
-      // need to go in uninitialized registers.
-      if (isVectorType(Ty))
-        return makeVectorOfZeros(Ty, RegNum);
-      Const = Ctx->getConstantZero(Ty);
-      From = Const;
-    }
-    // There should be no constants of vector type (other than undef).
-    assert(!isVectorType(Ty));
-
-    // If the operand is a 32-bit constant integer, we should check
-    // whether we need to randomize it or pool it.
-    if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
-      Operand *NewConst = randomizeOrPoolImmediate(C, RegNum);
-      if (NewConst != Const) {
-        return NewConst;
-      }
-    }
-
-    // Convert a scalar floating point constant into an explicit
-    // memory operand.
-    if (isScalarFloatingType(Ty)) {
-      Variable *Base = nullptr;
-      std::string Buffer;
-      llvm::raw_string_ostream StrBuf(Buffer);
-      llvm::cast<Constant>(From)->emitPoolLabel(StrBuf);
-      llvm::cast<Constant>(From)->setShouldBePooled(true);
-      Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true);
-      From = OperandX8632Mem::create(Func, Ty, Base, Offset);
-    }
-    bool NeedsReg = false;
-    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
-      // Immediate specifically not allowed
-      NeedsReg = true;
-    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
-      // On x86, FP constants are lowered to mem operands.
-      NeedsReg = true;
-    if (NeedsReg) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-  if (auto Var = llvm::dyn_cast<Variable>(From)) {
-    // Check if the variable is guaranteed a physical register.  This
-    // can happen either when the variable is pre-colored or when it is
-    // assigned infinite weight.
-    bool MustHaveRegister = (Var->hasReg() || Var->getWeight().isInf());
-    // We need a new physical register for the operand if:
-    //   Mem is not allowed and Var isn't guaranteed a physical
-    //   register, or
-    //   RegNum is required and Var->getRegNum() doesn't match.
-    if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
-        (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) {
-      From = copyToReg(From, RegNum);
-    }
-    return From;
-  }
-  llvm_unreachable("Unhandled operand kind in legalize()");
-  return From;
-}
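-
-// Typical uses of legalize() (a summary, not from the original
-// source): with the default Legal_All, a stack variable is left
-// alone; with Legal_Reg | Legal_Imm, a memory operand is copied into
-// a register; and legalize(From, Legal_Reg, RegNum) additionally pins
-// the result to a specific physical register.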
-
-// Provide a trivial wrapper to legalize() for this common usage.
-Variable *TargetX8632::legalizeToVar(Operand *From, int32_t RegNum) {
-  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
-}
-
-// For the cmp instruction, if Src1 is an immediate, or known to be a
-// physical register, we can allow Src0 to be a memory operand.
-// Otherwise, Src0 must be copied into a physical register.
-// (Actually, either Src0 or Src1 can be chosen for the physical
-// register, but unfortunately we have to commit to one or the other
-// before register allocation.)
-Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
-  bool IsSrc1ImmOrReg = false;
-  if (llvm::isa<Constant>(Src1)) {
-    IsSrc1ImmOrReg = true;
-  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
-    if (Var->hasReg())
-      IsSrc1ImmOrReg = true;
-  }
-  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
-}
-
-OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Opnd, Type Ty,
-                                                bool DoLegalize) {
-  OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Opnd);
-  // It may be the case that address mode optimization already creates
-  // an OperandX8632Mem, so in that case it wouldn't need another level
-  // of transformation.
-  if (!Mem) {
-    Variable *Base = llvm::dyn_cast<Variable>(Opnd);
-    Constant *Offset = llvm::dyn_cast<Constant>(Opnd);
-    assert(Base || Offset);
-    if (Offset) {
-      // During memory operand building, we do not blind or pool the
-      // constant offset; we will work on the whole memory operand as
-      // one entity later, which saves one instruction.  By turning
-      // blinding and pooling off, we guarantee that legalize(Offset)
-      // will return a Constant*.
-      {
-        BoolFlagSaver B(RandomizationPoolingPaused, true);
-
-        Offset = llvm::cast<Constant>(legalize(Offset));
-      }
-
-      assert(llvm::isa<ConstantInteger32>(Offset) ||
-             llvm::isa<ConstantRelocatable>(Offset));
-    }
-    Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
-  }
-  // Either do full legalization (which includes randomization/pooling)
-  // or just do randomization/pooling.
-  return llvm::cast<OperandX8632Mem>(
-      DoLegalize ? legalize(Mem) : randomizeOrPoolImmediate(Mem));
-}
-
-Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
-  // There aren't any 64-bit integer registers for x86-32.
-  assert(Type != IceType_i64);
-  Variable *Reg = Func->makeVariable(Type);
-  if (RegNum == Variable::NoRegister)
-    Reg->setWeightInfinite();
-  else
-    Reg->setRegNum(RegNum);
-  return Reg;
-}
-
-void TargetX8632::postLower() {
-  if (Ctx->getFlags().getOptLevel() == Opt_m1)
-    return;
-  inferTwoAddress();
-}
-
-void TargetX8632::makeRandomRegisterPermutation(
-    llvm::SmallVectorImpl<int32_t> &Permutation,
-    const llvm::SmallBitVector &ExcludeRegisters) const {
-  // TODO(stichnot): Declaring Permutation this way loses type/size
-  // information.  Fix this in conjunction with the caller-side TODO.
-  assert(Permutation.size() >= RegX8632::Reg_NUM);
-  // Expected upper bound on the number of registers in a single
-  // equivalence class.  For x86-32, this would comprise the 8 XMM
-  // registers.  This is for performance, not correctness.
-  static const unsigned MaxEquivalenceClassSize = 8;
-  typedef llvm::SmallVector<int32_t, MaxEquivalenceClassSize> RegisterList;
-  typedef std::map<uint32_t, RegisterList> EquivalenceClassMap;
-  EquivalenceClassMap EquivalenceClasses;
-  SizeT NumShuffled = 0, NumPreserved = 0;
-
-// Build up the equivalence classes of registers by looking at the
-// register properties as well as whether the registers should be
-// explicitly excluded from shuffling.
-#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
-          frameptr, isI8, isInt, isFP)                                         \
-  if (ExcludeRegisters[RegX8632::val]) {                                       \
-    /* val stays the same in the resulting permutation. */                     \
-    Permutation[RegX8632::val] = RegX8632::val;                                \
-    ++NumPreserved;                                                            \
-  } else {                                                                     \
-    const uint32_t Index = (scratch << 0) | (preserved << 1) | (isI8 << 2) |   \
-                           (isInt << 3) | (isFP << 4);                         \
-    /* val is assigned to an equivalence class based on its properties. */     \
-    EquivalenceClasses[Index].push_back(RegX8632::val);                        \
-  }
-  REGX8632_TABLE
-#undef X
-
-  RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
-
-  // Shuffle the resulting equivalence classes.
-  for (auto I : EquivalenceClasses) {
-    const RegisterList &List = I.second;
-    RegisterList Shuffled(List);
-    RandomShuffle(Shuffled.begin(), Shuffled.end(), RNG);
-    for (size_t SI = 0, SE = Shuffled.size(); SI < SE; ++SI) {
-      Permutation[List[SI]] = Shuffled[SI];
-      ++NumShuffled;
-    }
-  }
-
-  assert(NumShuffled + NumPreserved == RegX8632::Reg_NUM);
-
-  if (Func->isVerbose(IceV_Random)) {
-    OstreamLocker L(Func->getContext());
-    Ostream &Str = Func->getContext()->getStrDump();
-    Str << "Register equivalence classes:\n";
-    for (auto I : EquivalenceClasses) {
-      Str << "{";
-      const RegisterList &List = I.second;
-      bool First = true;
-      for (int32_t Register : List) {
-        if (!First)
-          Str << " ";
-        First = false;
-        Str << getRegName(Register, IceType_i32);
-      }
-      Str << "}\n";
-    }
-  }
-}
-
-void TargetX8632::emit(const ConstantInteger32 *C) const {
-  if (!ALLOW_DUMP)
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << getConstantPrefix() << C->getValue();
-}
-
-void TargetX8632::emit(const ConstantInteger64 *) const {
-  llvm::report_fatal_error("Not expecting to emit 64-bit integers");
-}
-
-void TargetX8632::emit(const ConstantFloat *C) const {
-  if (!ALLOW_DUMP)
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  C->emitPoolLabel(Str);
-}
-
-void TargetX8632::emit(const ConstantDouble *C) const {
-  if (!ALLOW_DUMP)
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  C->emitPoolLabel(Str);
-}
-
-void TargetX8632::emit(const ConstantUndef *) const {
-  llvm::report_fatal_error("undef value encountered by emitter.");
-}
-
-TargetDataX8632::TargetDataX8632(GlobalContext *Ctx)
-    : TargetDataLowering(Ctx) {}
-
-void TargetDataX8632::lowerGlobals(const VariableDeclarationList &Vars,
-                                   const IceString &SectionSuffix) {
-  switch (Ctx->getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, llvm::ELF::R_386_32, SectionSuffix);
-  } break;
-  case FT_Asm:
-  case FT_Iasm: {
-    const IceString &TranslateOnly = Ctx->getFlags().getTranslateOnly();
-    OstreamLocker L(Ctx);
-    for (const VariableDeclaration *Var : Vars) {
-      if (GlobalContext::matchSymbolName(Var->getName(), TranslateOnly)) {
-        emitGlobal(*Var, SectionSuffix);
-      }
-    }
-  } break;
-  }
-}
-
-template <typename T> struct PoolTypeConverter {};
-
-template <> struct PoolTypeConverter<float> {
-  typedef uint32_t PrimitiveIntType;
-  typedef ConstantFloat IceType;
-  static const Type Ty = IceType_f32;
-  static const char *TypeName;
-  static const char *AsmTag;
-  static const char *PrintfString;
-};
-const char *PoolTypeConverter<float>::TypeName = "float";
-const char *PoolTypeConverter<float>::AsmTag = ".long";
-const char *PoolTypeConverter<float>::PrintfString = "0x%x";
-
-template <> struct PoolTypeConverter<double> {
-  typedef uint64_t PrimitiveIntType;
-  typedef ConstantDouble IceType;
-  static const Type Ty = IceType_f64;
-  static const char *TypeName;
-  static const char *AsmTag;
-  static const char *PrintfString;
-};
-const char *PoolTypeConverter<double>::TypeName = "double";
-const char *PoolTypeConverter<double>::AsmTag = ".quad";
-const char *PoolTypeConverter<double>::PrintfString = "0x%llx";
-
-// Add converter for int type constant pooling
-template <> struct PoolTypeConverter<uint32_t> {
-  typedef uint32_t PrimitiveIntType;
-  typedef ConstantInteger32 IceType;
-  static const Type Ty = IceType_i32;
-  static const char *TypeName;
-  static const char *AsmTag;
-  static const char *PrintfString;
-};
-const char *PoolTypeConverter<uint32_t>::TypeName = "i32";
-const char *PoolTypeConverter<uint32_t>::AsmTag = ".long";
-const char *PoolTypeConverter<uint32_t>::PrintfString = "0x%x";
-
-// Add converter for int type constant pooling
-template <> struct PoolTypeConverter<uint16_t> {
-  typedef uint32_t PrimitiveIntType;
-  typedef ConstantInteger32 IceType;
-  static const Type Ty = IceType_i16;
-  static const char *TypeName;
-  static const char *AsmTag;
-  static const char *PrintfString;
-};
-const char *PoolTypeConverter<uint16_t>::TypeName = "i16";
-const char *PoolTypeConverter<uint16_t>::AsmTag = ".short";
-const char *PoolTypeConverter<uint16_t>::PrintfString = "0x%x";
-
-// Add converter for int type constant pooling
-template <> struct PoolTypeConverter<uint8_t> {
-  typedef uint32_t PrimitiveIntType;
-  typedef ConstantInteger32 IceType;
-  static const Type Ty = IceType_i8;
-  static const char *TypeName;
-  static const char *AsmTag;
-  static const char *PrintfString;
-};
-const char *PoolTypeConverter<uint8_t>::TypeName = "i8";
-const char *PoolTypeConverter<uint8_t>::AsmTag = ".byte";
-const char *PoolTypeConverter<uint8_t>::PrintfString = "0x%x";
-
-template <typename T>
-void TargetDataX8632::emitConstantPool(GlobalContext *Ctx) {
-  if (!ALLOW_DUMP)
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Type Ty = T::Ty;
-  SizeT Align = typeAlignInBytes(Ty);
-  ConstantList Pool = Ctx->getConstantPool(Ty);
-
-  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
-      << "\n";
-  Str << "\t.align\t" << Align << "\n";
-  for (Constant *C : Pool) {
-    if (!C->getShouldBePooled())
-      continue;
-    typename T::IceType *Const = llvm::cast<typename T::IceType>(C);
-    typename T::IceType::PrimType Value = Const->getValue();
-    // Use memcpy() to copy bits from Value into RawValue in a way
-    // that avoids breaking strict-aliasing rules.
-    typename T::PrimitiveIntType RawValue;
-    memcpy(&RawValue, &Value, sizeof(Value));
-    char buf[30];
-    int CharsPrinted =
-        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
-    assert(CharsPrinted >= 0 &&
-           (size_t)CharsPrinted < llvm::array_lengthof(buf));
-    (void)CharsPrinted; // avoid warnings if asserts are disabled
-    Const->emitPoolLabel(Str);
-    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t# " << T::TypeName << " "
-        << Value << "\n";
-  }
-}
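-
-// A sketch of the emitted assembly for the f32 pool (the label name
-// here is hypothetical):
-//   .section .rodata.cst4,"aM",@progbits,4
-//   .align 4
-//   L$f32$0:
-//     .long 0x3f800000   # float 1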
-
-void TargetDataX8632::lowerConstants() {
-  if (Ctx->getFlags().getDisableTranslation())
-    return;
-  // For x86, integer constants are normally embedded as immediates in
-  // the instructions, but pooled integers (from immediate pooling) and
-  // float/double constants must be emitted here.
-  switch (Ctx->getFlags().getOutFileType()) {
-  case FT_Elf: {
-    ELFObjectWriter *Writer = Ctx->getObjectWriter();
-
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
-    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
-
-    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
-    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
-  } break;
-  case FT_Asm:
-  case FT_Iasm: {
-    OstreamLocker L(Ctx);
-
-    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
-    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
-    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
-
-    emitConstantPool<PoolTypeConverter<float>>(Ctx);
-    emitConstantPool<PoolTypeConverter<double>>(Ctx);
-  } break;
-  }
-}
-
-TargetHeaderX8632::TargetHeaderX8632(GlobalContext *Ctx)
-    : TargetHeaderLowering(Ctx) {}
-
-// Randomize or pool an Immediate.
-Operand *TargetX8632::randomizeOrPoolImmediate(Constant *Immediate,
-                                               int32_t RegNum) {
-  assert(llvm::isa<ConstantInteger32>(Immediate) ||
-         llvm::isa<ConstantRelocatable>(Immediate));
-  if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
-      RandomizationPoolingPaused == true) {
-    // Immediates randomization/pooling off or paused
-    return Immediate;
-  }
-  if (Immediate->shouldBeRandomizedOrPooled(Ctx)) {
-    Ctx->statsUpdateRPImms();
-    if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() ==
-        RPI_Randomize) {
-      // blind the constant
-      // FROM:
-      //  imm
-      // TO:
-      //  insert: mov imm+cookie, Reg
-      //  insert: lea -cookie[Reg], Reg
-      //  => Reg
-      // If we have already assigned a physical register, we must come from
-      // advancedPhiLowering()=>lowerAssign().  In this case we should reuse
-      // the assigned register, as this assignment is the start of its
-      // use-def chain.  So we add the RegNum argument here.
-      // Note we use 'lea' instruction instead of 'xor' to avoid affecting
-      // the flags.
-      Variable *Reg = makeReg(IceType_i32, RegNum);
-      ConstantInteger32 *Integer = llvm::cast<ConstantInteger32>(Immediate);
-      uint32_t Value = Integer->getValue();
-      uint32_t Cookie = Ctx->getRandomizationCookie();
-      _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value));
-      Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie);
-      _lea(Reg,
-           OperandX8632Mem::create(Func, IceType_i32, Reg, Offset, nullptr, 0));
-      // Make sure liveness analysis won't kill this variable; otherwise a
-      // liveness assertion will be triggered.
-      _set_dest_nonkillable();
-      if (Immediate->getType() != IceType_i32) {
-        Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
-        _mov(TruncReg, Reg);
-        return TruncReg;
-      }
-      return Reg;
-    }
-    if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) {
-      // pool the constant
-      // FROM:
-      //  imm
-      // TO:
-      //  insert: mov $label, Reg
-      //  => Reg
-      assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool);
-      Immediate->setShouldBePooled(true);
-      // If we have already assigned a physical register, we must come from
-      // advancedPhiLowering()=>lowerAssign().  In this case we should reuse
-      // the assigned register, as this assignment is the start of its
-      // use-def chain.  So we add the RegNum argument here.
-      Variable *Reg = makeReg(Immediate->getType(), RegNum);
-      IceString Label;
-      llvm::raw_string_ostream Label_stream(Label);
-      Immediate->emitPoolLabel(Label_stream);
-      const RelocOffsetT Offset = 0;
-      const bool SuppressMangling = true;
-      Constant *Symbol =
-          Ctx->getConstantSym(Offset, Label_stream.str(), SuppressMangling);
-      OperandX8632Mem *MemOperand =
-          OperandX8632Mem::create(Func, Immediate->getType(), nullptr, Symbol);
-      _mov(Reg, MemOperand);
-      return Reg;
-    }
-    assert("Unsupported -randomize-pool-immediates option" && false);
-  }
-  // The constant Immediate is not eligible for blinding or pooling.
-  return Immediate;
-}
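-
-// Why the blinding above is sound: for any 32-bit imm and cookie,
-// (imm + cookie) - cookie == imm (mod 2^32), so the emitted pair
-//   mov imm+cookie, Reg
-//   lea -cookie[Reg], Reg
-// leaves Reg holding exactly imm, while the raw immediate never
-// appears in the instruction stream.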
-
-OperandX8632Mem *
-TargetX8632::randomizeOrPoolImmediate(OperandX8632Mem *MemOperand,
-                                      int32_t RegNum) {
-  assert(MemOperand);
-  if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
-      RandomizationPoolingPaused == true) {
-    // Immediate randomization/pooling is turned off or paused.
-    return MemOperand;
-  }
-
-  // If this memory operand has already been randomized, we do
-  // not randomize it again.
-  if (MemOperand->getRandomized())
-    return MemOperand;
-
-  if (Constant *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset())) {
-    if (C->shouldBeRandomizedOrPooled(Ctx)) {
-      // The offset of this mem operand should be blinded or pooled
-      Ctx->statsUpdateRPImms();
-      if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() ==
-          RPI_Randomize) {
-        // blind the constant offset
-        // FROM:
-        //  offset[base, index, shift]
-        // TO:
-        //  insert: lea offset+cookie[base], RegTemp
-        //  => -cookie[RegTemp, index, shift]
-        uint32_t Value =
-            llvm::dyn_cast<ConstantInteger32>(MemOperand->getOffset())
-                ->getValue();
-        uint32_t Cookie = Ctx->getRandomizationCookie();
-        Constant *Mask1 = Ctx->getConstantInt(
-            MemOperand->getOffset()->getType(), Cookie + Value);
-        Constant *Mask2 =
-            Ctx->getConstantInt(MemOperand->getOffset()->getType(), 0 - Cookie);
-
-        OperandX8632Mem *TempMemOperand = OperandX8632Mem::create(
-            Func, MemOperand->getType(), MemOperand->getBase(), Mask1);
-        // If we have already assigned a physical register, we must come from
-        // advancedPhiLowering()=>lowerAssign().  In this case we should reuse
-        // the assigned register, as this assignment is the start of its
-        // use-def chain.  So we add the RegNum argument here.
-        Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
-        _lea(RegTemp, TempMemOperand);
-        // As the source operand doesn't use the dest register, we don't
-        // need to add _set_dest_nonkillable().  But if we use the same
-        // Dest Reg, that is, with RegNum assigned, we should add
-        // _set_dest_nonkillable().
-        if (RegNum != Variable::NoRegister)
-          _set_dest_nonkillable();
-
-        OperandX8632Mem *NewMemOperand = OperandX8632Mem::create(
-            Func, MemOperand->getType(), RegTemp, Mask2, MemOperand->getIndex(),
-            MemOperand->getShift(), MemOperand->getSegmentRegister());
-
-        // Label this memory operand as randomized, so we won't randomize
-        // it again in case we call legalize() multiple times on this
-        // memory operand.
-        NewMemOperand->setRandomized(true);
-        return NewMemOperand;
-      }
-      if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) {
-        // pool the constant offset
-        // FROM:
-        //  offset[base, index, shift]
-        // TO:
-        //  insert: mov $label, RegTemp
-        //  insert: lea [base, RegTemp], RegTemp
-        //  =>[RegTemp, index, shift]
-        assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() ==
-               RPI_Pool);
-        // Memory operands should never appear as source operands in phi
-        // lowering assignments, so there is no need to reuse any
-        // registers here.  For phi lowering, we should not ask for new
-        // physical registers in general.  However, if we do meet a
-        // memory operand during phi lowering, we should not blind or
-        // pool the immediates for now.
-        if (RegNum != Variable::NoRegister)
-          return MemOperand;
-        Variable *RegTemp = makeReg(IceType_i32);
-        IceString Label;
-        llvm::raw_string_ostream Label_stream(Label);
-        MemOperand->getOffset()->emitPoolLabel(Label_stream);
-        MemOperand->getOffset()->setShouldBePooled(true);
-        const RelocOffsetT SymOffset = 0;
-        bool SuppressMangling = true;
-        Constant *Symbol = Ctx->getConstantSym(SymOffset, Label_stream.str(),
-                                               SuppressMangling);
-        OperandX8632Mem *SymbolOperand = OperandX8632Mem::create(
-            Func, MemOperand->getOffset()->getType(), nullptr, Symbol);
-        _mov(RegTemp, SymbolOperand);
-        // If we have a base variable here, we should add the lea instruction
-        // to add the value of the base variable to RegTemp. If there is no
-        // base variable, we won't need this lea instruction.
-        if (MemOperand->getBase()) {
-          OperandX8632Mem *CalculateOperand = OperandX8632Mem::create(
-              Func, MemOperand->getType(), MemOperand->getBase(), nullptr,
-              RegTemp, 0, MemOperand->getSegmentRegister());
-          _lea(RegTemp, CalculateOperand);
-          _set_dest_nonkillable();
-        }
-        OperandX8632Mem *NewMemOperand = OperandX8632Mem::create(
-            Func, MemOperand->getType(), RegTemp, nullptr,
-            MemOperand->getIndex(), MemOperand->getShift(),
-            MemOperand->getSegmentRegister());
-        return NewMemOperand;
-      }
-      assert("Unsupported -randomize-pool-immediates option" && false);
-    }
-  }
-  // The offset is not eligible for blinding or pooling; return the
-  // original memory operand.
-  return MemOperand;
-}
-
 } // end of namespace Ice
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 6fce261..30f6e6e 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -16,136 +16,20 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H
 #define SUBZERO_SRC_ICETARGETLOWERINGX8632_H
 
-#include <unordered_map>
-
 #include "IceAssemblerX8632.h"
 #include "IceDefs.h"
-#include "IceInst.h"
 #include "IceInstX8632.h"
 #include "IceRegistersX8632.h"
 #include "IceTargetLowering.h"
 
 namespace Ice {
 
-class BoolFoldingEntry {
-  BoolFoldingEntry(const BoolFoldingEntry &) = delete;
-
-public:
-  BoolFoldingEntry() = default;
-  explicit BoolFoldingEntry(Inst *I);
-  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
-  // Instr is the instruction producing the i1-type variable of interest.
-  Inst *Instr = nullptr;
-  // IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
-  bool IsComplex = false;
-  // IsLiveOut is initialized conservatively to true, and is set to false when
-  // we encounter an instruction that ends Var's live range.  We disable the
-  // folding optimization when Var is live beyond this basic block.  Note that
-  // if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
-  // always be true and the folding optimization will never be performed.
-  bool IsLiveOut = true;
-  // NumUses counts the number of times Var is used as a source operand in the
-  // basic block.  If IsComplex is true and there is more than one use of Var,
-  // then the folding optimization is disabled for Var.
-  uint32_t NumUses = 0;
-};
-
-class BoolFolding {
-public:
-  enum BoolFoldingProducerKind {
-    PK_None,
-    PK_Icmp32,
-    PK_Icmp64,
-    PK_Fcmp,
-    PK_Trunc
-  };
-
-  // Currently the actual enum values are not used (other than CK_None), but we
-  // go
-  // ahead and produce them anyway for symmetry with the
-  // BoolFoldingProducerKind.
-  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
-
-private:
-  BoolFolding(const BoolFolding &) = delete;
-  BoolFolding &operator=(const BoolFolding &) = delete;
-
-public:
-  BoolFolding() = default;
-  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
-  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
-  static bool hasComplexLowering(const Inst *Instr);
-  void init(CfgNode *Node);
-  const Inst *getProducerFor(const Operand *Opnd) const;
-  void dump(const Cfg *Func) const;
-
-private:
-  // Returns true if Producers contains a valid entry for the given VarNum.
-  bool containsValid(SizeT VarNum) const {
-    auto Element = Producers.find(VarNum);
-    return Element != Producers.end() && Element->second.Instr != nullptr;
-  }
-  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
-  // Producers maps Variable::Number to a BoolFoldingEntry.
-  std::unordered_map<SizeT, BoolFoldingEntry> Producers;
-};
-
 class TargetX8632 : public TargetLowering {
   TargetX8632() = delete;
   TargetX8632(const TargetX8632 &) = delete;
   TargetX8632 &operator=(const TargetX8632 &) = delete;
 
 public:
-  static TargetX8632 *create(Cfg *Func) { return new TargetX8632(Func); }
-
-  void translateOm1() override;
-  void translateO2() override;
-  void doLoadOpt();
-  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
-
-  SizeT getNumRegisters() const override { return RegX8632::Reg_NUM; }
-  Variable *getPhysicalRegister(SizeT RegNum, Type Ty = IceType_void) override;
-  IceString getRegName(SizeT RegNum, Type Ty) const override;
-  llvm::SmallBitVector getRegisterSet(RegSetMask Include,
-                                      RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
-  }
-  bool hasFramePointer() const override { return IsEbpBasedFrame; }
-  SizeT getFrameOrStackReg() const override {
-    return IsEbpBasedFrame ? RegX8632::Reg_ebp : RegX8632::Reg_esp;
-  }
-  size_t typeWidthInBytesOnStack(Type Ty) const override {
-    // Round up to the next multiple of 4 bytes.  In particular, i1,
-    // i8, and i16 are rounded up to 4 bytes.
-    return (typeWidthInBytes(Ty) + 3) & ~3;
-  }
-
-  void emitVariable(const Variable *Var) const override;
-
-  const char *getConstantPrefix() const final { return "$"; }
-  void emit(const ConstantUndef *C) const final;
-  void emit(const ConstantInteger32 *C) const final;
-  void emit(const ConstantInteger64 *C) const final;
-  void emit(const ConstantFloat *C) const final;
-  void emit(const ConstantDouble *C) const final;
-
-  void lowerArguments() override;
-  void initNodeForLowering(CfgNode *Node) override;
-  void addProlog(CfgNode *Node) override;
-  void addEpilog(CfgNode *Node) override;
-  // Ensure that a 64-bit Variable has been split into 2 32-bit
-  // Variables, creating them if necessary.  This is needed for all
-  // I64 operations, and it is needed for pushing F64 arguments for
-  // function calls using the 32-bit push instruction (though the
-  // latter could be done by directly writing to the stack).
-  void split64(Variable *Var);
-  Operand *loOperand(Operand *Operand);
-  Operand *hiOperand(Operand *Operand);
-  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
-                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
-  X8632::Address stackVarToAsmOperand(const Variable *Var) const;
-
   enum X86InstructionSet {
     Begin,
     // SSE2 is the PNaCl baseline instruction set.
@@ -154,458 +38,12 @@
     End
   };
 
-  X86InstructionSet getInstructionSet() const { return InstructionSet; }
+  static TargetX8632 *create(Cfg *Func);
+  virtual X8632::Address stackVarToAsmOperand(const Variable *Var) const = 0;
+  virtual X86InstructionSet getInstructionSet() const = 0;
 
 protected:
-  explicit TargetX8632(Cfg *Func);
-
-  void postLower() override;
-
-  void lowerAlloca(const InstAlloca *Inst) override;
-  void lowerArithmetic(const InstArithmetic *Inst) override;
-  void lowerAssign(const InstAssign *Inst) override;
-  void lowerBr(const InstBr *Inst) override;
-  void lowerCall(const InstCall *Inst) override;
-  void lowerCast(const InstCast *Inst) override;
-  void lowerExtractElement(const InstExtractElement *Inst) override;
-  void lowerFcmp(const InstFcmp *Inst) override;
-  void lowerIcmp(const InstIcmp *Inst) override;
-  void lowerIntrinsicCall(const InstIntrinsicCall *Inst) override;
-  void lowerInsertElement(const InstInsertElement *Inst) override;
-  void lowerLoad(const InstLoad *Inst) override;
-  void lowerPhi(const InstPhi *Inst) override;
-  void lowerRet(const InstRet *Inst) override;
-  void lowerSelect(const InstSelect *Inst) override;
-  void lowerStore(const InstStore *Inst) override;
-  void lowerSwitch(const InstSwitch *Inst) override;
-  void lowerUnreachable(const InstUnreachable *Inst) override;
-  void lowerOther(const Inst *Instr) override;
-  void lowerRMW(const InstX8632FakeRMW *RMW);
-  void prelowerPhis() override;
-  void lowerPhiAssignments(CfgNode *Node,
-                           const AssignList &Assignments) override;
-  void doAddressOptLoad() override;
-  void doAddressOptStore() override;
-  void randomlyInsertNop(float Probability) override;
-
-  // Naive lowering of cmpxchg.
-  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
-                          Operand *Desired);
-  // Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
-  bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
-                                Operand *Expected, Operand *Desired);
-  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
-                      Operand *Val);
-  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
-                       Operand *SecondVal);
-
-  typedef void (TargetX8632::*LowerBinOp)(Variable *, Operand *);
-  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
-                                Variable *Dest, Operand *Ptr, Operand *Val);
-
-  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
-
-  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
-                           Operand *Src0, Operand *Src1);
-
-  // Operand legalization helpers.  To deal with address mode
-  // constraints, the helpers will create a new Operand and emit
-  // instructions that guarantee that the Operand kind is one of those
-  // indicated by the LegalMask (a bitmask of allowed kinds).  If the
-  // input Operand is known to already meet the constraints, it may be
-  // simply returned as the result, without creating any new
-  // instructions or operands.
-  enum OperandLegalization {
-    Legal_None = 0,
-    Legal_Reg = 1 << 0, // physical register, not stack location
-    Legal_Imm = 1 << 1,
-    Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
-    Legal_All = ~Legal_None
-  };
-  typedef uint32_t LegalMask;
-  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
-                    int32_t RegNum = Variable::NoRegister);
-  Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
-  // Legalize the first source operand for use in the cmp instruction.
-  Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
-  // Turn a pointer operand into a memory operand that can be
-  // used by a real load/store operation. Legalizes the operand as well.
-  // This is a nop if the operand is already a legal memory operand.
-  OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty,
-                                     bool DoLegalize = true);
-
-  Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
-  static Type stackSlotType();
-
-  Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
-
-  // Returns a vector in a register with the given constant entries.
-  Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
-  Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
-  Variable *makeVectorOfMinusOnes(Type Ty,
-                                  int32_t RegNum = Variable::NoRegister);
-  Variable *makeVectorOfHighOrderBits(Type Ty,
-                                      int32_t RegNum = Variable::NoRegister);
-  Variable *makeVectorOfFabsMask(Type Ty,
-                                 int32_t RegNum = Variable::NoRegister);
-
-  // Return a memory operand corresponding to a stack allocated Variable.
-  OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
-                                                uint32_t Offset = 0);
-
-  void makeRandomRegisterPermutation(
-      llvm::SmallVectorImpl<int32_t> &Permutation,
-      const llvm::SmallBitVector &ExcludeRegisters) const override;
-
-  // The following are helpers that insert lowered x86 instructions
-  // with minimal syntactic overhead, so that the lowering code can
-  // look as close to assembly as practical.
-  void _adc(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Adc::create(Func, Dest, Src0));
-  }
-  void _adc_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
-    Context.insert(InstX8632AdcRMW::create(Func, DestSrc0, Src1));
-  }
-  void _add(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Add::create(Func, Dest, Src0));
-  }
-  void _add_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
-    Context.insert(InstX8632AddRMW::create(Func, DestSrc0, Src1));
-  }
-  void _adjust_stack(int32_t Amount) {
-    Context.insert(InstX8632AdjustStack::create(
-        Func, Amount, getPhysicalRegister(RegX8632::Reg_esp)));
-  }
-  void _addps(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Addps::create(Func, Dest, Src0));
-  }
-  void _addss(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Addss::create(Func, Dest, Src0));
-  }
-  void _and(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632And::create(Func, Dest, Src0));
-  }
-  void _and_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
-    Context.insert(InstX8632AndRMW::create(Func, DestSrc0, Src1));
-  }
-  void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Blendvps::create(Func, Dest, Src0, Src1));
-  }
-  void _br(CondX86::BrCond Condition, CfgNode *TargetTrue,
-           CfgNode *TargetFalse) {
-    Context.insert(
-        InstX8632Br::create(Func, TargetTrue, TargetFalse, Condition));
-  }
-  void _br(CfgNode *Target) {
-    Context.insert(InstX8632Br::create(Func, Target));
-  }
-  void _br(CondX86::BrCond Condition, CfgNode *Target) {
-    Context.insert(InstX8632Br::create(Func, Target, Condition));
-  }
-  void _br(CondX86::BrCond Condition, InstX8632Label *Label) {
-    Context.insert(InstX8632Br::create(Func, Label, Condition));
-  }
-  void _bsf(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Bsf::create(Func, Dest, Src0));
-  }
-  void _bsr(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Bsr::create(Func, Dest, Src0));
-  }
-  void _bswap(Variable *SrcDest) {
-    Context.insert(InstX8632Bswap::create(Func, SrcDest));
-  }
-  void _cbwdq(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
-  }
-  void _cmov(Variable *Dest, Operand *Src0, CondX86::BrCond Condition) {
-    Context.insert(InstX8632Cmov::create(Func, Dest, Src0, Condition));
-  }
-  void _cmp(Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
-  }
-  void _cmpps(Variable *Dest, Operand *Src0, CondX86::CmppsCond Condition) {
-    Context.insert(InstX8632Cmpps::create(Func, Dest, Src0, Condition));
-  }
-  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
-                bool Locked) {
-    Context.insert(
-        InstX8632Cmpxchg::create(Func, DestOrAddr, Eax, Desired, Locked));
-    // Mark eax as possibly modified by cmpxchg.
-    Context.insert(
-        InstFakeDef::create(Func, Eax, llvm::dyn_cast<Variable>(DestOrAddr)));
-    _set_dest_nonkillable();
-    Context.insert(InstFakeUse::create(Func, Eax));
-  }
-  void _cmpxchg8b(OperandX8632Mem *Addr, Variable *Edx, Variable *Eax,
-                  Variable *Ecx, Variable *Ebx, bool Locked) {
-    Context.insert(
-        InstX8632Cmpxchg8b::create(Func, Addr, Edx, Eax, Ecx, Ebx, Locked));
-    // Mark edx, and eax as possibly modified by cmpxchg8b.
-    Context.insert(InstFakeDef::create(Func, Edx));
-    _set_dest_nonkillable();
-    Context.insert(InstFakeUse::create(Func, Edx));
-    Context.insert(InstFakeDef::create(Func, Eax));
-    _set_dest_nonkillable();
-    Context.insert(InstFakeUse::create(Func, Eax));
-  }
-  void _cvt(Variable *Dest, Operand *Src0, InstX8632Cvt::CvtVariant Variant) {
-    Context.insert(InstX8632Cvt::create(Func, Dest, Src0, Variant));
-  }
-  void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Div::create(Func, Dest, Src0, Src1));
-  }
-  void _divps(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Divps::create(Func, Dest, Src0));
-  }
-  void _divss(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Divss::create(Func, Dest, Src0));
-  }
-  void _fld(Operand *Src0) { Context.insert(InstX8632Fld::create(Func, Src0)); }
-  void _fstp(Variable *Dest) {
-    Context.insert(InstX8632Fstp::create(Func, Dest));
-  }
-  void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Idiv::create(Func, Dest, Src0, Src1));
-  }
-  void _imul(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Imul::create(Func, Dest, Src0));
-  }
-  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Insertps::create(Func, Dest, Src0, Src1));
-  }
-  void _jmp(Operand *Target) {
-    Context.insert(InstX8632Jmp::create(Func, Target));
-  }
-  void _lea(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Lea::create(Func, Dest, Src0));
-  }
-  void _mfence() { Context.insert(InstX8632Mfence::create(Func)); }
-  // If Dest=nullptr is passed in, then a new variable is created,
-  // marked as infinite register allocation weight, and returned
-  // through the in/out Dest argument.
-  void _mov(Variable *&Dest, Operand *Src0,
-            int32_t RegNum = Variable::NoRegister) {
-    if (Dest == nullptr)
-      Dest = makeReg(Src0->getType(), RegNum);
-    Context.insert(InstX8632Mov::create(Func, Dest, Src0));
-  }
-  void _mov_nonkillable(Variable *Dest, Operand *Src0) {
-    Inst *NewInst = InstX8632Mov::create(Func, Dest, Src0);
-    NewInst->setDestNonKillable();
-    Context.insert(NewInst);
-  }
-  void _movd(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Movd::create(Func, Dest, Src0));
-  }
-  void _movp(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Movp::create(Func, Dest, Src0));
-  }
-  void _movq(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Movq::create(Func, Dest, Src0));
-  }
-  void _movss(Variable *Dest, Variable *Src0) {
-    Context.insert(InstX8632MovssRegs::create(Func, Dest, Src0));
-  }
-  void _movsx(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Movsx::create(Func, Dest, Src0));
-  }
-  void _movzx(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Movzx::create(Func, Dest, Src0));
-  }
-  void _mul(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstX8632Mul::create(Func, Dest, Src0, Src1));
-  }
-  void _mulps(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Mulps::create(Func, Dest, Src0));
-  }
-  void _mulss(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Mulss::create(Func, Dest, Src0));
-  }
-  void _neg(Variable *SrcDest) {
-    Context.insert(InstX8632Neg::create(Func, SrcDest));
-  }
-  void _nop(SizeT Variant) {
-    Context.insert(InstX8632Nop::create(Func, Variant));
-  }
-  void _or(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Or::create(Func, Dest, Src0));
-  }
-  void _or_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
-    Context.insert(InstX8632OrRMW::create(Func, DestSrc0, Src1));
-  }
-  void _padd(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Padd::create(Func, Dest, Src0));
-  }
-  void _pand(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pand::create(Func, Dest, Src0));
-  }
-  void _pandn(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
-  }
-  void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Pblendvb::create(Func, Dest, Src0, Src1));
-  }
-  void _pcmpeq(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
-  }
-  void _pcmpgt(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
-  }
-  void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Pextr::create(Func, Dest, Src0, Src1));
-  }
-  void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Pinsr::create(Func, Dest, Src0, Src1));
-  }
-  void _pmull(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pmull::create(Func, Dest, Src0));
-  }
-  void _pmuludq(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pmuludq::create(Func, Dest, Src0));
-  }
-  void _pop(Variable *Dest) {
-    Context.insert(InstX8632Pop::create(Func, Dest));
-  }
-  void _por(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Por::create(Func, Dest, Src0));
-  }
-  void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Pshufd::create(Func, Dest, Src0, Src1));
-  }
-  void _psll(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Psll::create(Func, Dest, Src0));
-  }
-  void _psra(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Psra::create(Func, Dest, Src0));
-  }
-  void _psrl(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Psrl::create(Func, Dest, Src0));
-  }
-  void _psub(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Psub::create(Func, Dest, Src0));
-  }
-  void _push(Variable *Src0) {
-    Context.insert(InstX8632Push::create(Func, Src0));
-  }
-  void _pxor(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Pxor::create(Func, Dest, Src0));
-  }
-  void _ret(Variable *Src0 = nullptr) {
-    Context.insert(InstX8632Ret::create(Func, Src0));
-  }
-  void _rol(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Rol::create(Func, Dest, Src0));
-  }
-  void _sar(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Sar::create(Func, Dest, Src0));
-  }
-  void _sbb(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Sbb::create(Func, Dest, Src0));
-  }
-  void _sbb_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
-    Context.insert(InstX8632SbbRMW::create(Func, DestSrc0, Src1));
-  }
-  void _setcc(Variable *Dest, CondX86::BrCond Condition) {
-    Context.insert(InstX8632Setcc::create(Func, Dest, Condition));
-  }
-  void _shl(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Shl::create(Func, Dest, Src0));
-  }
-  void _shld(Variable *Dest, Variable *Src0, Variable *Src1) {
-    Context.insert(InstX8632Shld::create(Func, Dest, Src0, Src1));
-  }
-  void _shr(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Shr::create(Func, Dest, Src0));
-  }
-  void _shrd(Variable *Dest, Variable *Src0, Variable *Src1) {
-    Context.insert(InstX8632Shrd::create(Func, Dest, Src0, Src1));
-  }
-  void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Shufps::create(Func, Dest, Src0, Src1));
-  }
-  void _sqrtss(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Sqrtss::create(Func, Dest, Src0));
-  }
-  void _store(Operand *Value, OperandX8632 *Mem) {
-    Context.insert(InstX8632Store::create(Func, Value, Mem));
-  }
-  void _storep(Variable *Value, OperandX8632Mem *Mem) {
-    Context.insert(InstX8632StoreP::create(Func, Value, Mem));
-  }
-  void _storeq(Variable *Value, OperandX8632Mem *Mem) {
-    Context.insert(InstX8632StoreQ::create(Func, Value, Mem));
-  }
-  void _sub(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Sub::create(Func, Dest, Src0));
-  }
-  void _sub_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
-    Context.insert(InstX8632SubRMW::create(Func, DestSrc0, Src1));
-  }
-  void _subps(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Subps::create(Func, Dest, Src0));
-  }
-  void _subss(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Subss::create(Func, Dest, Src0));
-  }
-  void _test(Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Test::create(Func, Src0, Src1));
-  }
-  void _ucomiss(Operand *Src0, Operand *Src1) {
-    Context.insert(InstX8632Ucomiss::create(Func, Src0, Src1));
-  }
-  void _ud2() { Context.insert(InstX8632UD2::create(Func)); }
-  void _xadd(Operand *Dest, Variable *Src, bool Locked) {
-    Context.insert(InstX8632Xadd::create(Func, Dest, Src, Locked));
-    // The xadd exchanges Dest and Src (modifying Src).
-    // Model that update with a FakeDef followed by a FakeUse.
-    Context.insert(
-        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
-    _set_dest_nonkillable();
-    Context.insert(InstFakeUse::create(Func, Src));
-  }
-  void _xchg(Operand *Dest, Variable *Src) {
-    Context.insert(InstX8632Xchg::create(Func, Dest, Src));
-    // The xchg modifies Dest and Src -- model that update with a
-    // FakeDef/FakeUse.
-    Context.insert(
-        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
-    _set_dest_nonkillable();
-    Context.insert(InstFakeUse::create(Func, Src));
-  }
-  void _xor(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Xor::create(Func, Dest, Src0));
-  }
-  void _xor_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
-    Context.insert(InstX8632XorRMW::create(Func, DestSrc0, Src1));
-  }
-  void _set_dest_nonkillable() {
-    Context.getLastInserted()->setDestNonKillable();
-  }
-
-  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
-  void findRMW();
-
-  X86InstructionSet InstructionSet = X86InstructionSet::Begin;
-  bool IsEbpBasedFrame = false;
-  bool NeedsStackAlignment = false;
-  size_t SpillAreaSizeBytes = 0;
-  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
-  llvm::SmallBitVector ScratchRegs;
-  llvm::SmallBitVector RegsUsed;
-  VarList PhysicalRegisters[IceType_NUM];
-  static IceString RegNames[];
-
-  // Randomize a given immediate operand
-  Operand *randomizeOrPoolImmediate(Constant *Immediate,
-                                    int32_t RegNum = Variable::NoRegister);
-  OperandX8632Mem *
-  randomizeOrPoolImmediate(OperandX8632Mem *MemOperand,
-                           int32_t RegNum = Variable::NoRegister);
-  bool RandomizationPoolingPaused = false;
-
-private:
-  ~TargetX8632() override {}
-  BoolFolding FoldingInfo;
+  explicit TargetX8632(Cfg *Func) : TargetLowering(Func) {}
 };
 
 class TargetDataX8632 final : public TargetDataLowering {
@@ -626,7 +64,7 @@
   explicit TargetDataX8632(GlobalContext *Ctx);
 
 private:
-  ~TargetDataX8632() override {}
+  ~TargetDataX8632() override = default;
   template <typename T> static void emitConstantPool(GlobalContext *Ctx);
 };
 
diff --git a/src/IceTargetLoweringX8664.h b/src/IceTargetLoweringX8664.h
new file mode 100644
index 0000000..90daf8c
--- /dev/null
+++ b/src/IceTargetLoweringX8664.h
@@ -0,0 +1,58 @@
+//===- subzero/src/IceTargetLoweringX8664.h - x86-64 lowering ---*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetX8664 class, which implements the
+// TargetLowering interface for the x86-64 architecture.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8664_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX8664_H
+
+#include "IceDefs.h"
+#include "IceTargetLowering.h"
+
+namespace Ice {
+
+class TargetX8664 : public TargetLowering {
+  TargetX8664() = delete;
+  TargetX8664(const TargetX8664 &) = delete;
+  TargetX8664 &operator=(const TargetX8664 &) = delete;
+
+public:
+  static TargetX8664 *create(Cfg *) {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+};
+
+class TargetDataX8664 : public TargetDataLowering {
+  TargetDataX8664() = delete;
+  TargetDataX8664(const TargetDataX8664 &) = delete;
+  TargetDataX8664 &operator=(const TargetDataX8664 &) = delete;
+
+public:
+  static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+};
+
+class TargetHeaderX8664 : public TargetHeaderLowering {
+  TargetHeaderX8664() = delete;
+  TargetHeaderX8664(const TargetHeaderX8664 &) = delete;
+  TargetHeaderX8664 &operator=(const TargetHeaderX8664 &) = delete;
+
+public:
+  static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) {
+    llvm::report_fatal_error("Not yet implemented");
+  }
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX8664_H
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
new file mode 100644
index 0000000..a7c4275
--- /dev/null
+++ b/src/IceTargetLoweringX86Base.h
@@ -0,0 +1,610 @@
+//===- subzero/src/IceTargetLoweringX86Base.h - x86 lowering ----*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetX86Base template class, which implements
+// the TargetLowering base interface for the x86 architecture.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASE_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX86BASE_H
+
+#include <unordered_map>
+
+#include "IceDefs.h"
+#include "IceInst.h"
+#include "IceTargetLowering.h"
+
+namespace Ice {
+namespace X86Internal {
+
+template <class MachineTraits> class BoolFolding;
+
+template <class Machine> struct MachineTraits {};
+
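+// MachineTraits<Machine> is meant to be specialized per concrete target to
+// supply the machine-specific pieces consumed below, e.g. the InstructionSet
+// enum and the TableFcmp lowering table.  An illustrative sketch (names such
+// as TableFcmpType are hypothetical, not part of this patch):
+//
+//   template <> struct MachineTraits<TargetX8632> {
+//     using InstructionSet = TargetX8632::X86InstructionSet;
+//     static const TableFcmpType TableFcmp[];
+//   };
+//
+// Note also the inverted inheritance below: TargetX86Base derives from
+// Machine (e.g. TargetX8632, itself a TargetLowering), which declares the
+// pure virtual hooks (stackVarToAsmOperand(), getInstructionSet()) that this
+// template overrides as final.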
+template <class Machine> class TargetX86Base : public Machine {
+  TargetX86Base() = delete;
+  TargetX86Base(const TargetX86Base &) = delete;
+  TargetX86Base &operator=(const TargetX86Base &) = delete;
+
+protected:
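+  // TargetLowering is a dependent base here (via Machine), so its members
+  // are not visible to unqualified lookup inside this template; re-export
+  // the ones used below with using-declarations.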
+  using TargetLowering::H_bitcast_16xi1_i16;
+  using TargetLowering::H_bitcast_8xi1_i8;
+  using TargetLowering::H_bitcast_i16_16xi1;
+  using TargetLowering::H_bitcast_i8_8xi1;
+  using TargetLowering::H_call_ctpop_i32;
+  using TargetLowering::H_call_ctpop_i64;
+  using TargetLowering::H_call_longjmp;
+  using TargetLowering::H_call_memcpy;
+  using TargetLowering::H_call_memmove;
+  using TargetLowering::H_call_memset;
+  using TargetLowering::H_call_read_tp;
+  using TargetLowering::H_call_setjmp;
+  using TargetLowering::H_fptosi_f32_i64;
+  using TargetLowering::H_fptosi_f64_i64;
+  using TargetLowering::H_fptoui_4xi32_f32;
+  using TargetLowering::H_fptoui_f32_i32;
+  using TargetLowering::H_fptoui_f32_i64;
+  using TargetLowering::H_fptoui_f64_i32;
+  using TargetLowering::H_fptoui_f64_i64;
+  using TargetLowering::H_frem_f32;
+  using TargetLowering::H_frem_f64;
+  using TargetLowering::H_sdiv_i64;
+  using TargetLowering::H_sitofp_i64_f32;
+  using TargetLowering::H_sitofp_i64_f64;
+  using TargetLowering::H_srem_i64;
+  using TargetLowering::H_udiv_i64;
+  using TargetLowering::H_uitofp_4xi32_4xf32;
+  using TargetLowering::H_uitofp_i32_f32;
+  using TargetLowering::H_uitofp_i32_f64;
+  using TargetLowering::H_uitofp_i64_f32;
+  using TargetLowering::H_uitofp_i64_f64;
+  using TargetLowering::H_urem_i64;
+
+  using TargetLowering::alignStackSpillAreas;
+  using TargetLowering::assignVarStackSlots;
+  using TargetLowering::inferTwoAddress;
+  using TargetLowering::makeHelperCall;
+  using TargetLowering::getVarStackSlotParams;
+
+public:
+  using Traits = MachineTraits<Machine>;
+  using BoolFolding = ::Ice::X86Internal::BoolFolding<Traits>;
+
+  using TargetLowering::RegSet_All;
+  using TargetLowering::RegSet_CalleeSave;
+  using TargetLowering::RegSet_CallerSave;
+  using TargetLowering::RegSet_FramePointer;
+  using TargetLowering::RegSet_None;
+  using TargetLowering::RegSet_StackPointer;
+  using TargetLowering::Context;
+  using TargetLowering::Ctx;
+  using TargetLowering::Func;
+  using TargetLowering::RegSetMask;
+
+  using TargetLowering::_bundle_lock;
+  using TargetLowering::_bundle_unlock;
+  using TargetLowering::getContext;
+  using TargetLowering::getStackAdjustment;
+  using TargetLowering::regAlloc;
+  using TargetLowering::resetStackAdjustment;
+
+  static TargetX86Base *create(Cfg *Func) { return new TargetX86Base(Func); }
+
+  void translateOm1() override;
+  void translateO2() override;
+  void doLoadOpt();
+  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
+
+  SizeT getNumRegisters() const override { return RegX8632::Reg_NUM; }
+  Variable *getPhysicalRegister(SizeT RegNum, Type Ty = IceType_void) override;
+  IceString getRegName(SizeT RegNum, Type Ty) const override;
+  llvm::SmallBitVector getRegisterSet(RegSetMask Include,
+                                      RegSetMask Exclude) const override;
+  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
+    return TypeToRegisterSet[Ty];
+  }
+  bool hasFramePointer() const override { return IsEbpBasedFrame; }
+  SizeT getFrameOrStackReg() const override {
+    return IsEbpBasedFrame ? RegX8632::Reg_ebp : RegX8632::Reg_esp;
+  }
+  size_t typeWidthInBytesOnStack(Type Ty) const override {
+    // Round up to the next multiple of 4 bytes.  In particular, i1,
+    // i8, and i16 are rounded up to 4 bytes.
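+    // (Adding 3 and then clearing the low two bits rounds up: widths of
+    // 1, 2, and 4 bytes all map to 4, and 8 stays 8.)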
+    return (typeWidthInBytes(Ty) + 3) & ~3;
+  }
+
+  void emitVariable(const Variable *Var) const override;
+
+  const char *getConstantPrefix() const final { return "$"; }
+  void emit(const ConstantUndef *C) const final;
+  void emit(const ConstantInteger32 *C) const final;
+  void emit(const ConstantInteger64 *C) const final;
+  void emit(const ConstantFloat *C) const final;
+  void emit(const ConstantDouble *C) const final;
+
+  void lowerArguments() override;
+  void initNodeForLowering(CfgNode *Node) override;
+  void addProlog(CfgNode *Node) override;
+  void addEpilog(CfgNode *Node) override;
+  // Ensure that a 64-bit Variable has been split into 2 32-bit
+  // Variables, creating them if necessary.  This is needed for all
+  // I64 operations, and it is needed for pushing F64 arguments for
+  // function calls using the 32-bit push instruction (though the
+  // latter could be done by directly writing to the stack).
+  void split64(Variable *Var);
+  Operand *loOperand(Operand *Operand);
+  Operand *hiOperand(Operand *Operand);
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
+  X8632::Address stackVarToAsmOperand(const Variable *Var) const final;
+
+  typename Traits::InstructionSet getInstructionSet() const final {
+    return InstructionSet;
+  }
+
+protected:
+  explicit TargetX86Base(Cfg *Func);
+
+  void postLower() override;
+
+  void lowerAlloca(const InstAlloca *Inst) override;
+  void lowerArithmetic(const InstArithmetic *Inst) override;
+  void lowerAssign(const InstAssign *Inst) override;
+  void lowerBr(const InstBr *Inst) override;
+  void lowerCall(const InstCall *Inst) override;
+  void lowerCast(const InstCast *Inst) override;
+  void lowerExtractElement(const InstExtractElement *Inst) override;
+  void lowerFcmp(const InstFcmp *Inst) override;
+  void lowerIcmp(const InstIcmp *Inst) override;
+  void lowerIntrinsicCall(const InstIntrinsicCall *Inst) override;
+  void lowerInsertElement(const InstInsertElement *Inst) override;
+  void lowerLoad(const InstLoad *Inst) override;
+  void lowerPhi(const InstPhi *Inst) override;
+  void lowerRet(const InstRet *Inst) override;
+  void lowerSelect(const InstSelect *Inst) override;
+  void lowerStore(const InstStore *Inst) override;
+  void lowerSwitch(const InstSwitch *Inst) override;
+  void lowerUnreachable(const InstUnreachable *Inst) override;
+  void lowerOther(const Inst *Instr) override;
+  void lowerRMW(const InstX8632FakeRMW *RMW);
+  void prelowerPhis() override;
+  void lowerPhiAssignments(CfgNode *Node,
+                           const AssignList &Assignments) override;
+  void doAddressOptLoad() override;
+  void doAddressOptStore() override;
+  void randomlyInsertNop(float Probability) override;
+
+  // Naive lowering of cmpxchg.
+  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
+                          Operand *Desired);
+  // Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
+  bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
+                                Operand *Expected, Operand *Desired);
+  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
+                      Operand *Val);
+  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
+                       Operand *SecondVal);
+
+  typedef void (TargetX86Base::*LowerBinOp)(Variable *, Operand *);
+  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
+                                Variable *Dest, Operand *Ptr, Operand *Val);
+
+  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
+
+  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
+                           Operand *Src0, Operand *Src1);
+
+  // Operand legalization helpers.  To deal with address mode
+  // constraints, the helpers will create a new Operand and emit
+  // instructions that guarantee that the Operand kind is one of those
+  // indicated by the LegalMask (a bitmask of allowed kinds).  If the
+  // input Operand is known to already meet the constraints, it may be
+  // simply returned as the result, without creating any new
+  // instructions or operands.
+  enum OperandLegalization {
+    Legal_None = 0,
+    Legal_Reg = 1 << 0, // physical register, not stack location
+    Legal_Imm = 1 << 1,
+    Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
+    Legal_All = ~Legal_None
+  };
+  typedef uint32_t LegalMask;
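+  // For example (illustrative), legalize(From, Legal_Reg | Legal_Imm)
+  // guarantees the result is either a physical register or an immediate,
+  // emitting a copy into a new register if From is currently in memory.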
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
+                    int32_t RegNum = Variable::NoRegister);
+  Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
+  // Legalize the first source operand for use in the cmp instruction.
+  Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
+  // Turn a pointer operand into a memory operand that can be
+  // used by a real load/store operation. Legalizes the operand as well.
+  // This is a nop if the operand is already a legal memory operand.
+  OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty,
+                                     bool DoLegalize = true);
+
+  Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
+  static Type stackSlotType();
+
+  Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
+
+  // Returns a vector in a register with the given constant entries.
+  Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfMinusOnes(Type Ty,
+                                  int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfHighOrderBits(Type Ty,
+                                      int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfFabsMask(Type Ty,
+                                 int32_t RegNum = Variable::NoRegister);
+
+  // Return a memory operand corresponding to a stack allocated Variable.
+  OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                                uint32_t Offset = 0);
+
+  void makeRandomRegisterPermutation(
+      llvm::SmallVectorImpl<int32_t> &Permutation,
+      const llvm::SmallBitVector &ExcludeRegisters) const override;
+
+  // The following are helpers that insert lowered x86 instructions
+  // with minimal syntactic overhead, so that the lowering code can
+  // look as close to assembly as practical.
+  void _adc(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Adc::create(Func, Dest, Src0));
+  }
+  void _adc_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632AdcRMW::create(Func, DestSrc0, Src1));
+  }
+  void _add(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Add::create(Func, Dest, Src0));
+  }
+  void _add_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632AddRMW::create(Func, DestSrc0, Src1));
+  }
+  void _adjust_stack(int32_t Amount) {
+    Context.insert(InstX8632AdjustStack::create(
+        Func, Amount, getPhysicalRegister(RegX8632::Reg_esp)));
+  }
+  void _addps(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Addps::create(Func, Dest, Src0));
+  }
+  void _addss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Addss::create(Func, Dest, Src0));
+  }
+  void _and(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632And::create(Func, Dest, Src0));
+  }
+  void _and_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632AndRMW::create(Func, DestSrc0, Src1));
+  }
+  void _blendvps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Blendvps::create(Func, Dest, Src0, Src1));
+  }
+  void _br(CondX86::BrCond Condition, CfgNode *TargetTrue,
+           CfgNode *TargetFalse) {
+    Context.insert(
+        InstX8632Br::create(Func, TargetTrue, TargetFalse, Condition));
+  }
+  void _br(CfgNode *Target) {
+    Context.insert(InstX8632Br::create(Func, Target));
+  }
+  void _br(CondX86::BrCond Condition, CfgNode *Target) {
+    Context.insert(InstX8632Br::create(Func, Target, Condition));
+  }
+  void _br(CondX86::BrCond Condition, InstX8632Label *Label) {
+    Context.insert(InstX8632Br::create(Func, Label, Condition));
+  }
+  void _bsf(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Bsf::create(Func, Dest, Src0));
+  }
+  void _bsr(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Bsr::create(Func, Dest, Src0));
+  }
+  void _bswap(Variable *SrcDest) {
+    Context.insert(InstX8632Bswap::create(Func, SrcDest));
+  }
+  void _cbwdq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
+  }
+  void _cmov(Variable *Dest, Operand *Src0, CondX86::BrCond Condition) {
+    Context.insert(InstX8632Cmov::create(Func, Dest, Src0, Condition));
+  }
+  void _cmp(Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
+  }
+  void _cmpps(Variable *Dest, Operand *Src0, CondX86::CmppsCond Condition) {
+    Context.insert(InstX8632Cmpps::create(Func, Dest, Src0, Condition));
+  }
+  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
+                bool Locked) {
+    Context.insert(
+        InstX8632Cmpxchg::create(Func, DestOrAddr, Eax, Desired, Locked));
+    // Mark eax as possibly modified by cmpxchg.
+    Context.insert(
+        InstFakeDef::create(Func, Eax, llvm::dyn_cast<Variable>(DestOrAddr)));
+    _set_dest_nonkillable();
+    Context.insert(InstFakeUse::create(Func, Eax));
+  }
+  void _cmpxchg8b(OperandX8632Mem *Addr, Variable *Edx, Variable *Eax,
+                  Variable *Ecx, Variable *Ebx, bool Locked) {
+    Context.insert(
+        InstX8632Cmpxchg8b::create(Func, Addr, Edx, Eax, Ecx, Ebx, Locked));
+    // Mark edx and eax as possibly modified by cmpxchg8b.
+    Context.insert(InstFakeDef::create(Func, Edx));
+    _set_dest_nonkillable();
+    Context.insert(InstFakeUse::create(Func, Edx));
+    Context.insert(InstFakeDef::create(Func, Eax));
+    _set_dest_nonkillable();
+    Context.insert(InstFakeUse::create(Func, Eax));
+  }
+  void _cvt(Variable *Dest, Operand *Src0, InstX8632Cvt::CvtVariant Variant) {
+    Context.insert(InstX8632Cvt::create(Func, Dest, Src0, Variant));
+  }
+  void _div(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Div::create(Func, Dest, Src0, Src1));
+  }
+  void _divps(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Divps::create(Func, Dest, Src0));
+  }
+  void _divss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Divss::create(Func, Dest, Src0));
+  }
+  void _fld(Operand *Src0) { Context.insert(InstX8632Fld::create(Func, Src0)); }
+  void _fstp(Variable *Dest) {
+    Context.insert(InstX8632Fstp::create(Func, Dest));
+  }
+  void _idiv(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Idiv::create(Func, Dest, Src0, Src1));
+  }
+  void _imul(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Imul::create(Func, Dest, Src0));
+  }
+  void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Insertps::create(Func, Dest, Src0, Src1));
+  }
+  void _jmp(Operand *Target) {
+    Context.insert(InstX8632Jmp::create(Func, Target));
+  }
+  void _lea(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Lea::create(Func, Dest, Src0));
+  }
+  void _mfence() { Context.insert(InstX8632Mfence::create(Func)); }
+  // If Dest=nullptr is passed in, then a new variable is created,
+  // marked as infinite register allocation weight, and returned
+  // through the in/out Dest argument.
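+  // (Illustrative: 'Variable *T = nullptr; _mov(T, Src);' creates an
+  // infinite-weight temporary T and moves Src into it.)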
+  void _mov(Variable *&Dest, Operand *Src0,
+            int32_t RegNum = Variable::NoRegister) {
+    if (Dest == nullptr)
+      Dest = makeReg(Src0->getType(), RegNum);
+    Context.insert(InstX8632Mov::create(Func, Dest, Src0));
+  }
+  void _mov_nonkillable(Variable *Dest, Operand *Src0) {
+    Inst *NewInst = InstX8632Mov::create(Func, Dest, Src0);
+    NewInst->setDestNonKillable();
+    Context.insert(NewInst);
+  }
+  void _movd(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movd::create(Func, Dest, Src0));
+  }
+  void _movp(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movp::create(Func, Dest, Src0));
+  }
+  void _movq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movq::create(Func, Dest, Src0));
+  }
+  void _movss(Variable *Dest, Variable *Src0) {
+    Context.insert(InstX8632MovssRegs::create(Func, Dest, Src0));
+  }
+  void _movsx(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movsx::create(Func, Dest, Src0));
+  }
+  void _movzx(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movzx::create(Func, Dest, Src0));
+  }
+  void _mul(Variable *Dest, Variable *Src0, Operand *Src1) {
+    Context.insert(InstX8632Mul::create(Func, Dest, Src0, Src1));
+  }
+  void _mulps(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Mulps::create(Func, Dest, Src0));
+  }
+  void _mulss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Mulss::create(Func, Dest, Src0));
+  }
+  void _neg(Variable *SrcDest) {
+    Context.insert(InstX8632Neg::create(Func, SrcDest));
+  }
+  void _nop(SizeT Variant) {
+    Context.insert(InstX8632Nop::create(Func, Variant));
+  }
+  void _or(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Or::create(Func, Dest, Src0));
+  }
+  void _or_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632OrRMW::create(Func, DestSrc0, Src1));
+  }
+  void _padd(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Padd::create(Func, Dest, Src0));
+  }
+  void _pand(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pand::create(Func, Dest, Src0));
+  }
+  void _pandn(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pandn::create(Func, Dest, Src0));
+  }
+  void _pblendvb(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pblendvb::create(Func, Dest, Src0, Src1));
+  }
+  void _pcmpeq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pcmpeq::create(Func, Dest, Src0));
+  }
+  void _pcmpgt(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
+  }
+  void _pextr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pextr::create(Func, Dest, Src0, Src1));
+  }
+  void _pinsr(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pinsr::create(Func, Dest, Src0, Src1));
+  }
+  void _pmull(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pmull::create(Func, Dest, Src0));
+  }
+  void _pmuludq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pmuludq::create(Func, Dest, Src0));
+  }
+  void _pop(Variable *Dest) {
+    Context.insert(InstX8632Pop::create(Func, Dest));
+  }
+  void _por(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Por::create(Func, Dest, Src0));
+  }
+  void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pshufd::create(Func, Dest, Src0, Src1));
+  }
+  void _psll(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Psll::create(Func, Dest, Src0));
+  }
+  void _psra(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Psra::create(Func, Dest, Src0));
+  }
+  void _psrl(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Psrl::create(Func, Dest, Src0));
+  }
+  void _psub(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Psub::create(Func, Dest, Src0));
+  }
+  void _push(Variable *Src0) {
+    Context.insert(InstX8632Push::create(Func, Src0));
+  }
+  void _pxor(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Pxor::create(Func, Dest, Src0));
+  }
+  void _ret(Variable *Src0 = nullptr) {
+    Context.insert(InstX8632Ret::create(Func, Src0));
+  }
+  void _rol(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Rol::create(Func, Dest, Src0));
+  }
+  void _sar(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Sar::create(Func, Dest, Src0));
+  }
+  void _sbb(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Sbb::create(Func, Dest, Src0));
+  }
+  void _sbb_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632SbbRMW::create(Func, DestSrc0, Src1));
+  }
+  void _setcc(Variable *Dest, CondX86::BrCond Condition) {
+    Context.insert(InstX8632Setcc::create(Func, Dest, Condition));
+  }
+  void _shl(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Shl::create(Func, Dest, Src0));
+  }
+  void _shld(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstX8632Shld::create(Func, Dest, Src0, Src1));
+  }
+  void _shr(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Shr::create(Func, Dest, Src0));
+  }
+  void _shrd(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstX8632Shrd::create(Func, Dest, Src0, Src1));
+  }
+  void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Shufps::create(Func, Dest, Src0, Src1));
+  }
+  void _sqrtss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Sqrtss::create(Func, Dest, Src0));
+  }
+  void _store(Operand *Value, OperandX8632 *Mem) {
+    Context.insert(InstX8632Store::create(Func, Value, Mem));
+  }
+  void _storep(Variable *Value, OperandX8632Mem *Mem) {
+    Context.insert(InstX8632StoreP::create(Func, Value, Mem));
+  }
+  void _storeq(Variable *Value, OperandX8632Mem *Mem) {
+    Context.insert(InstX8632StoreQ::create(Func, Value, Mem));
+  }
+  void _sub(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Sub::create(Func, Dest, Src0));
+  }
+  void _sub_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632SubRMW::create(Func, DestSrc0, Src1));
+  }
+  void _subps(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Subps::create(Func, Dest, Src0));
+  }
+  void _subss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Subss::create(Func, Dest, Src0));
+  }
+  void _test(Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Test::create(Func, Src0, Src1));
+  }
+  void _ucomiss(Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Ucomiss::create(Func, Src0, Src1));
+  }
+  void _ud2() { Context.insert(InstX8632UD2::create(Func)); }
+  void _xadd(Operand *Dest, Variable *Src, bool Locked) {
+    Context.insert(InstX8632Xadd::create(Func, Dest, Src, Locked));
+    // The xadd exchanges Dest and Src (modifying Src).
+    // Model that update with a FakeDef followed by a FakeUse.
+    Context.insert(
+        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
+    _set_dest_nonkillable();
+    Context.insert(InstFakeUse::create(Func, Src));
+  }
+  void _xchg(Operand *Dest, Variable *Src) {
+    Context.insert(InstX8632Xchg::create(Func, Dest, Src));
+    // The xchg modifies Dest and Src -- model that update with a
+    // FakeDef/FakeUse.
+    Context.insert(
+        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
+    _set_dest_nonkillable();
+    Context.insert(InstFakeUse::create(Func, Src));
+  }
+  void _xor(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Xor::create(Func, Dest, Src0));
+  }
+  void _xor_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632XorRMW::create(Func, DestSrc0, Src1));
+  }
+  void _set_dest_nonkillable() {
+    Context.getLastInserted()->setDestNonKillable();
+  }
+
+  bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
+  void findRMW();
+
+  typename Traits::InstructionSet InstructionSet =
+      Traits::InstructionSet::Begin;
+  bool IsEbpBasedFrame = false;
+  bool NeedsStackAlignment = false;
+  size_t SpillAreaSizeBytes = 0;
+  llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
+  llvm::SmallBitVector ScratchRegs;
+  llvm::SmallBitVector RegsUsed;
+  VarList PhysicalRegisters[IceType_NUM];
+  static IceString RegNames[];
+
+  // Randomize or pool a given immediate operand.
+  Operand *randomizeOrPoolImmediate(Constant *Immediate,
+                                    int32_t RegNum = Variable::NoRegister);
+  OperandX8632Mem *
+  randomizeOrPoolImmediate(OperandX8632Mem *MemOperand,
+                           int32_t RegNum = Variable::NoRegister);
+  bool RandomizationPoolingPaused = false;
+
+private:
+  ~TargetX86Base() override = default;
+  BoolFolding FoldingInfo;
+};
+} // end of namespace X86Internal
+} // end of namespace Ice
+
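+// The member definitions live in a header because TargetX86Base is a
+// template: they must be visible in every translation unit that
+// instantiates it.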
+#include "IceTargetLoweringX86BaseImpl.h"
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASE_H
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
new file mode 100644
index 0000000..b787c03
--- /dev/null
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -0,0 +1,5503 @@
+//===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetX86Base class, which consists almost
+// entirely of the lowering sequence for each high-level instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
+
+#include "llvm/Support/MathExtras.h"
+
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceClFlags.h"
+#include "IceDefs.h"
+#include "IceELFObjectWriter.h"
+#include "IceGlobalInits.h"
+#include "IceInstX8632.h"
+#include "IceLiveness.h"
+#include "IceOperand.h"
+#include "IceRegistersX8632.h"
+#include "IceTargetLoweringX8632.def"
+#include "IceTargetLoweringX8632.h"
+#include "IceUtils.h"
+
+namespace Ice {
+namespace X86Internal {
+
+// A helper class to ease the setting of RandomizationPoolingPaused, which
+// disables constant blinding or pooling during some translation phases.
+class BoolFlagSaver {
+  BoolFlagSaver() = delete;
+  BoolFlagSaver(const BoolFlagSaver &) = delete;
+  BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
+
+public:
+  BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
+  ~BoolFlagSaver() { Flag = OldValue; }
+
+private:
+  const bool OldValue;
+  bool &Flag;
+};
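+//
+// Illustrative use: pause pooling for the duration of a scope:
+//   { BoolFlagSaver B(RandomizationPoolingPaused, true);
+//     ... lowering that must not blind or pool immediates ...
+//   } // the flag reverts to its previous value here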
+
+template <class MachineTraits> class BoolFoldingEntry {
+  BoolFoldingEntry(const BoolFoldingEntry &) = delete;
+
+public:
+  BoolFoldingEntry() = default;
+  explicit BoolFoldingEntry(Inst *I);
+  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
+  // Instr is the instruction producing the i1-type variable of interest.
+  Inst *Instr = nullptr;
+  // IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
+  bool IsComplex = false;
+  // IsLiveOut is initialized conservatively to true, and is set to false when
+  // we encounter an instruction that ends Var's live range.  We disable the
+  // folding optimization when Var is live beyond this basic block.  Note that
+  // if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
+  // always be true and the folding optimization will never be performed.
+  bool IsLiveOut = true;
+  // NumUses counts the number of times Var is used as a source operand in the
+  // basic block.  If IsComplex is true and there is more than one use of Var,
+  // then the folding optimization is disabled for Var.
+  uint32_t NumUses = 0;
+};
+
+template <class MachineTraits> class BoolFolding {
+public:
+  enum BoolFoldingProducerKind {
+    PK_None,
+    PK_Icmp32,
+    PK_Icmp64,
+    PK_Fcmp,
+    PK_Trunc
+  };
+
+  // Currently the actual enum values are not used (other than CK_None), but
+  // we go ahead and produce them anyway for symmetry with
+  // BoolFoldingProducerKind.
+  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
+
+private:
+  BoolFolding(const BoolFolding &) = delete;
+  BoolFolding &operator=(const BoolFolding &) = delete;
+
+public:
+  BoolFolding() = default;
+  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
+  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
+  static bool hasComplexLowering(const Inst *Instr);
+  void init(CfgNode *Node);
+  const Inst *getProducerFor(const Operand *Opnd) const;
+  void dump(const Cfg *Func) const;
+
+private:
+  // Returns true if Producers contains a valid entry for the given VarNum.
+  bool containsValid(SizeT VarNum) const {
+    auto Element = Producers.find(VarNum);
+    return Element != Producers.end() && Element->second.Instr != nullptr;
+  }
+  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
+  // Producers maps Variable::Number to a BoolFoldingEntry.
+  std::unordered_map<SizeT, BoolFoldingEntry<MachineTraits>> Producers;
+};
+
+template <class MachineTraits>
+BoolFoldingEntry<MachineTraits>::BoolFoldingEntry(Inst *I)
+    : Instr(I), IsComplex(BoolFolding<MachineTraits>::hasComplexLowering(I)) {}
+
+template <class MachineTraits>
+typename BoolFolding<MachineTraits>::BoolFoldingProducerKind
+BoolFolding<MachineTraits>::getProducerKind(const Inst *Instr) {
+  if (llvm::isa<InstIcmp>(Instr)) {
+    if (Instr->getSrc(0)->getType() != IceType_i64)
+      return PK_Icmp32;
+    return PK_None; // TODO(stichnot): actually PK_Icmp64;
+  }
+  return PK_None; // TODO(stichnot): remove this
+
+  if (llvm::isa<InstFcmp>(Instr))
+    return PK_Fcmp;
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return PK_None;
+    case InstCast::Trunc:
+      return PK_Trunc;
+    }
+  }
+  return PK_None;
+}
+
+template <class MachineTraits>
+typename BoolFolding<MachineTraits>::BoolFoldingConsumerKind
+BoolFolding<MachineTraits>::getConsumerKind(const Inst *Instr) {
+  if (llvm::isa<InstBr>(Instr))
+    return CK_Br;
+  if (llvm::isa<InstSelect>(Instr))
+    return CK_Select;
+  return CK_None; // TODO(stichnot): remove this
+
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return CK_None;
+    case InstCast::Sext:
+      return CK_Sext;
+    case InstCast::Zext:
+      return CK_Zext;
+    }
+  }
+  return CK_None;
+}
+
+// Returns true if the producing instruction has a "complex" lowering
+// sequence.  This generally means that its lowering sequence requires
+// more than one conditional branch, namely 64-bit integer compares
+// and some floating-point compares.  When this is true, and there is
+// more than one consumer, we prefer to disable the folding
+// optimization because disabling it minimizes the total number of
+// branches.
+template <class MachineTraits>
+bool BoolFolding<MachineTraits>::hasComplexLowering(const Inst *Instr) {
+  switch (getProducerKind(Instr)) {
+  default:
+    return false;
+  case PK_Icmp64:
+    return true;
+  case PK_Fcmp:
+    return MachineTraits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()]
+               .C2 != CondX86::Br_None;
+  }
+}
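+
+// Illustrative case of a "complex" producer: an equality compare such as
+// "fcmp oeq" must check both the "equal" and the "ordered" conditions, so
+// its TableFcmp entry carries a second branch condition (C2 !=
+// CondX86::Br_None) and hasComplexLowering() returns true, whereas a
+// compare like "fcmp olt" needs only one branch and is not complex.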
+
+template <class MachineTraits>
+void BoolFolding<MachineTraits>::init(CfgNode *Node) {
+  Producers.clear();
+  for (Inst &Instr : Node->getInsts()) {
+    // Check whether Instr is a valid producer.
+    Variable *Var = Instr.getDest();
+    if (!Instr.isDeleted() // only consider non-deleted instructions
+        && Var             // only instructions with an actual dest var
+        && Var->getType() == IceType_i1          // only bool-type dest vars
+        && getProducerKind(&Instr) != PK_None) { // white-listed instructions
+      Producers[Var->getIndex()] = BoolFoldingEntry<MachineTraits>(&Instr);
+    }
+    // Check each src variable against the map.
+    for (SizeT I = 0; I < Instr.getSrcSize(); ++I) {
+      Operand *Src = Instr.getSrc(I);
+      SizeT NumVars = Src->getNumVars();
+      for (SizeT J = 0; J < NumVars; ++J) {
+        const Variable *Var = Src->getVar(J);
+        SizeT VarNum = Var->getIndex();
+        if (containsValid(VarNum)) {
+          if (I != 0 // All valid consumers use Var as the first source operand
+              || getConsumerKind(&Instr) == CK_None // must be white-listed
+              || (Producers[VarNum].IsComplex && // complex can't be multi-use
+                  Producers[VarNum].NumUses > 0)) {
+            setInvalid(VarNum);
+            continue;
+          }
+          ++Producers[VarNum].NumUses;
+          if (Instr.isLastUse(Var)) {
+            Producers[VarNum].IsLiveOut = false;
+          }
+        }
+      }
+    }
+  }
+  for (auto &I : Producers) {
+    // Ignore entries previously marked invalid.
+    if (I.second.Instr == nullptr)
+      continue;
+    // Disable the producer if its dest may be live beyond this block.
+    if (I.second.IsLiveOut) {
+      setInvalid(I.first);
+      continue;
+    }
+    // Mark as "dead" rather than outright deleting.  This is so that other
+    // peephole-style optimizations during or before lowering have access to
+    // this instruction in undeleted form.  See for example
+    // tryOptimizedCmpxchgCmpBr().
+    I.second.Instr->setDead();
+  }
+}
+
+template <class MachineTraits>
+const Inst *
+BoolFolding<MachineTraits>::getProducerFor(const Operand *Opnd) const {
+  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
+  if (Var == nullptr)
+    return nullptr;
+  SizeT VarNum = Var->getIndex();
+  auto Element = Producers.find(VarNum);
+  if (Element == Producers.end())
+    return nullptr;
+  return Element->second.Instr;
+}
+
+template <class MachineTraits>
+void BoolFolding<MachineTraits>::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP || !Func->isVerbose(IceV_Folding))
+    return;
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  for (auto &I : Producers) {
+    if (I.second.Instr == nullptr)
+      continue;
+    Str << "Found foldable producer:\n  ";
+    I.second.Instr->dump(Func);
+    Str << "\n";
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::initNodeForLowering(CfgNode *Node) {
+  FoldingInfo.init(Node);
+  FoldingInfo.dump(Func);
+}
+
+template <class Machine>
+TargetX86Base<Machine>::TargetX86Base(Cfg *Func)
+    : Machine(Func) {
+  static_assert(
+      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
+          (TargetInstructionSet::X86InstructionSet_End -
+           TargetInstructionSet::X86InstructionSet_Begin),
+      "Traits::InstructionSet range different from TargetInstructionSet");
+  if (Func->getContext()->getFlags().getTargetInstructionSet() !=
+      TargetInstructionSet::BaseInstructionSet) {
+    InstructionSet = static_cast<typename Traits::InstructionSet>(
+        (Func->getContext()->getFlags().getTargetInstructionSet() -
+         TargetInstructionSet::X86InstructionSet_Begin) +
+        Traits::InstructionSet::Begin);
+  }
+  // TODO: Don't initialize IntegerRegisters and friends every time.
+  // Instead, initialize in some sort of static initializer for the
+  // class.
+  llvm::SmallBitVector IntegerRegisters(RegX8632::Reg_NUM);
+  llvm::SmallBitVector IntegerRegistersI8(RegX8632::Reg_NUM);
+  llvm::SmallBitVector FloatRegisters(RegX8632::Reg_NUM);
+  llvm::SmallBitVector VectorRegisters(RegX8632::Reg_NUM);
+  llvm::SmallBitVector InvalidRegisters(RegX8632::Reg_NUM);
+  ScratchRegs.resize(RegX8632::Reg_NUM);
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  IntegerRegisters[RegX8632::val] = isInt;                                     \
+  IntegerRegistersI8[RegX8632::val] = isI8;                                    \
+  FloatRegisters[RegX8632::val] = isFP;                                        \
+  VectorRegisters[RegX8632::val] = isFP;                                       \
+  ScratchRegs[RegX8632::val] = scratch;
+  REGX8632_TABLE;
+#undef X
+  TypeToRegisterSet[IceType_void] = InvalidRegisters;
+  TypeToRegisterSet[IceType_i1] = IntegerRegistersI8;
+  TypeToRegisterSet[IceType_i8] = IntegerRegistersI8;
+  TypeToRegisterSet[IceType_i16] = IntegerRegisters;
+  TypeToRegisterSet[IceType_i32] = IntegerRegisters;
+  TypeToRegisterSet[IceType_i64] = IntegerRegisters;
+  TypeToRegisterSet[IceType_f32] = FloatRegisters;
+  TypeToRegisterSet[IceType_f64] = FloatRegisters;
+  TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
+  TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
+  TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
+  TypeToRegisterSet[IceType_v16i8] = VectorRegisters;
+  TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
+  TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
+  TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
+}
+
+template <class Machine> void TargetX86Base<Machine>::translateO2() {
+  TimerMarker T(TimerStack::TT_O2, Func);
+
+  if (!Ctx->getFlags().getPhiEdgeSplit()) {
+    // Lower Phi instructions.
+    Func->placePhiLoads();
+    if (Func->hasError())
+      return;
+    Func->placePhiStores();
+    if (Func->hasError())
+      return;
+    Func->deletePhis();
+    if (Func->hasError())
+      return;
+    Func->dump("After Phi lowering");
+  }
+
+  // Address mode optimization.
+  Func->getVMetadata()->init(VMK_SingleDefs);
+  Func->doAddressOpt();
+
+  // Find read-modify-write opportunities.  Do this after address mode
+  // optimization so that doAddressOpt() doesn't need to be applied to RMW
+  // instructions as well.
+  findRMW();
+  Func->dump("After RMW transform");
+
+  // Argument lowering
+  Func->doArgLowering();
+
+  // Target lowering.  This requires liveness analysis for some parts
+  // of the lowering decisions, such as compare/branch fusing.  If
+  // non-lightweight liveness analysis is used, the instructions need
+  // to be renumbered first.  TODO: This renumbering should only be
+  // necessary if we're actually calculating live intervals, which we
+  // only do for register allocation.
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+
+  // TODO: It should be sufficient to use the fastest liveness
+  // calculation, i.e. livenessLightweight().  However, for some
+  // reason that slows down the rest of the translation.  Investigate.
+  Func->liveness(Liveness_Basic);
+  if (Func->hasError())
+    return;
+  Func->dump("After x86 address mode opt");
+
+  // Disable constant blinding or pooling for load optimization.
+  {
+    BoolFlagSaver B(RandomizationPoolingPaused, true);
+    doLoadOpt();
+  }
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  Func->dump("After x86 codegen");
+
+  // Register allocation.  This requires instruction renumbering and
+  // full liveness analysis.
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+  Func->liveness(Liveness_Intervals);
+  if (Func->hasError())
+    return;
+  // Validate the live range computations.  The expensive validation
+  // call is deliberately only made when assertions are enabled.
+  assert(Func->validateLiveness());
+  // The post-codegen dump is done here, after liveness analysis and
+  // associated cleanup, to make the dump cleaner and more useful.
+  Func->dump("After initial x8632 codegen");
+  Func->getVMetadata()->init(VMK_All);
+  regAlloc(RAK_Global);
+  if (Func->hasError())
+    return;
+  Func->dump("After linear scan regalloc");
+
+  if (Ctx->getFlags().getPhiEdgeSplit()) {
+    // We need to pause constant blinding or pooling during advanced
+    // phi lowering, unless the lowering assignment has a physical
+    // register for the dest Variable.
+    {
+      BoolFlagSaver B(RandomizationPoolingPaused, true);
+      Func->advancedPhiLowering();
+    }
+    Func->dump("After advanced Phi lowering");
+  }
+
+  // Stack frame mapping.
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  Func->dump("After stack frame mapping");
+
+  Func->contractEmptyNodes();
+  Func->reorderNodes();
+
+  // Branch optimization.  This needs to be done just before code
+  // emission.  In particular, no transformations that insert or
+  // reorder CfgNodes should be done after branch optimization.  We go
+  // ahead and do it before nop insertion to reduce the amount of work
+  // needed for searching for opportunities.
+  Func->doBranchOpt();
+  Func->dump("After branch optimization");
+
+  // Nop insertion
+  if (Ctx->getFlags().shouldDoNopInsertion()) {
+    Func->doNopInsertion();
+  }
+}
+
+template <class Machine> void TargetX86Base<Machine>::translateOm1() {
+  TimerMarker T(TimerStack::TT_Om1, Func);
+
+  Func->placePhiLoads();
+  if (Func->hasError())
+    return;
+  Func->placePhiStores();
+  if (Func->hasError())
+    return;
+  Func->deletePhis();
+  if (Func->hasError())
+    return;
+  Func->dump("After Phi lowering");
+
+  Func->doArgLowering();
+
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  Func->dump("After initial x8632 codegen");
+
+  regAlloc(RAK_InfOnly);
+  if (Func->hasError())
+    return;
+  Func->dump("After regalloc of infinite-weight variables");
+
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  Func->dump("After stack frame mapping");
+
+  // Nop insertion
+  if (Ctx->getFlags().shouldDoNopInsertion()) {
+    Func->doNopInsertion();
+  }
+}
+
+bool canRMW(const InstArithmetic *Arith) {
+  Type Ty = Arith->getDest()->getType();
+  // X86 vector instructions write to a register and have no RMW
+  // option.
+  if (isVectorType(Ty))
+    return false;
+  bool isI64 = Ty == IceType_i64;
+
+  switch (Arith->getOp()) {
+  // Not handled for lack of simple lowering:
+  //   shift on i64
+  //   mul, udiv, urem, sdiv, srem, frem
+  // Not handled for lack of RMW instructions:
+  //   fadd, fsub, fmul, fdiv (also vector types)
+  default:
+    return false;
+  case InstArithmetic::Add:
+  case InstArithmetic::Sub:
+  case InstArithmetic::And:
+  case InstArithmetic::Or:
+  case InstArithmetic::Xor:
+    return true;
+  case InstArithmetic::Shl:
+  case InstArithmetic::Lshr:
+  case InstArithmetic::Ashr:
+    return false; // TODO(stichnot): implement
+    return !isI64;
+  }
+}
+
+bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
+  if (A == B)
+    return true;
+  if (auto *MemA = llvm::dyn_cast<OperandX8632Mem>(A)) {
+    if (auto *MemB = llvm::dyn_cast<OperandX8632Mem>(B)) {
+      return MemA->getBase() == MemB->getBase() &&
+             MemA->getOffset() == MemB->getOffset() &&
+             MemA->getIndex() == MemB->getIndex() &&
+             MemA->getShift() == MemB->getShift() &&
+             MemA->getSegmentRegister() == MemB->getSegmentRegister();
+    }
+  }
+  return false;
+}
+
+template <class Machine> void TargetX86Base<Machine>::findRMW() {
+  Func->dump("Before RMW");
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  for (CfgNode *Node : Func->getNodes()) {
+    // Walk through the instructions, considering each sequence of 3
+    // instructions, and look for the particular RMW pattern.  Note that this
+    // search can be "broken" (false negatives) if there are intervening deleted
+    // instructions, or intervening instructions that could be safely moved out
+    // of the way to reveal an RMW pattern.
+    auto E = Node->getInsts().end();
+    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
+    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
+      // Make I3 skip over deleted instructions.
+      while (I3 != E && I3->isDeleted())
+        ++I3;
+      if (I1 == E || I2 == E || I3 == E)
+        continue;
+      assert(!I1->isDeleted());
+      assert(!I2->isDeleted());
+      assert(!I3->isDeleted());
+      if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) {
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) {
+          if (auto *Store = llvm::dyn_cast<InstStore>(I3)) {
+            // Look for:
+            //   a = Load addr
+            //   b = <op> a, other
+            //   Store b, addr
+            // Change to:
+            //   a = Load addr
+            //   b = <op> a, other
+            //   x = FakeDef
+            //   RMW <op>, addr, other, x
+            //   b = Store b, addr, x
+            // Note that inferTwoAddress() makes sure setDestNonKillable() gets
+            // called on the updated Store instruction, to avoid liveness
+            // problems later.
+            //
+            // With this transformation, the Store instruction acquires a Dest
+            // variable and is now subject to dead code elimination if there are
+            // no more uses of "b".  Variable "x" is a beacon for determining
+            // whether the Store instruction gets dead-code eliminated.  If the
+            // Store instruction is eliminated, then it must be the case that
+            // the RMW instruction ends x's live range, and therefore the RMW
+            // instruction will be retained and later lowered.  On the other
+            // hand, if the RMW instruction does not end x's live range, then
+            // the Store instruction must still be present, and therefore the
+            // RMW instruction is ignored during lowering because it is
+            // redundant with the Store instruction.
+            //
+            // Note that if "a" has further uses, the RMW transformation may
+            // still trigger, resulting in two loads and one store, which is
+            // worse than the original one load and one store.  However, this is
+            // probably rare, and caching probably keeps it just as fast.
+            if (!isSameMemAddressOperand(Load->getSourceAddress(),
+                                         Store->getAddr()))
+              continue;
+            Operand *ArithSrcFromLoad = Arith->getSrc(0);
+            Operand *ArithSrcOther = Arith->getSrc(1);
+            if (ArithSrcFromLoad != Load->getDest()) {
+              if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
+                continue;
+              std::swap(ArithSrcFromLoad, ArithSrcOther);
+            }
+            if (Arith->getDest() != Store->getData())
+              continue;
+            if (!canRMW(Arith))
+              continue;
+            if (Func->isVerbose(IceV_RMW)) {
+              Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
+              Load->dump(Func);
+              Str << "\n  ";
+              Arith->dump(Func);
+              Str << "\n  ";
+              Store->dump(Func);
+              Str << "\n";
+            }
+            Variable *Beacon = Func->makeVariable(IceType_i32);
+            Beacon->setWeight(0);
+            Store->setRmwBeacon(Beacon);
+            InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
+            Node->getInsts().insert(I3, BeaconDef);
+            InstX8632FakeRMW *RMW = InstX8632FakeRMW::create(
+                Func, ArithSrcOther, Store->getAddr(), Beacon, Arith->getOp());
+            Node->getInsts().insert(I3, RMW);
+          }
+        }
+      }
+    }
+  }
+}
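+
+// A sketch of the intended net effect of the transformation above, assuming
+// "a" and "b" have no other uses so the Store (and then the Load) get
+// dead-code eliminated:
+//   a = load [addr]
+//   b = add a, 1
+//   store b, [addr]
+// ultimately lowers to a single read-modify-write "add [addr], 1".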
+
+// Converts a ConstantInteger32 operand into its constant value, or
+// MemoryOrderInvalid if the operand is not a ConstantInteger32.
+uint64_t getConstantMemoryOrder(Operand *Opnd) {
+  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
+    return Integer->getValue();
+  return Intrinsics::MemoryOrderInvalid;
+}
+
+// Determines whether the dest of a Load instruction can be folded
+// into one of the src operands of a 2-operand instruction.  This is
+// true as long as the load dest matches exactly one of the binary
+// instruction's src operands.  Replaces Src0 or Src1 with LoadSrc if
+// the answer is true.
+bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
+                               Operand *&Src0, Operand *&Src1) {
+  if (Src0 == LoadDest && Src1 != LoadDest) {
+    Src0 = LoadSrc;
+    return true;
+  }
+  if (Src0 != LoadDest && Src1 == LoadDest) {
+    Src1 = LoadSrc;
+    return true;
+  }
+  return false;
+}
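+
+// For example (illustrative): given "x = load addr" followed by
+// "y = add x, z", the load dest x matches exactly one src of the add, so
+// that src is replaced by the memory operand and the pair can become
+// "y = add [addr], z", provided the add ends x's live range (checked by
+// the caller).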
+
+template <class Machine> void TargetX86Base<Machine>::doLoadOpt() {
+  for (CfgNode *Node : Func->getNodes()) {
+    Context.init(Node);
+    while (!Context.atEnd()) {
+      Variable *LoadDest = nullptr;
+      Operand *LoadSrc = nullptr;
+      Inst *CurInst = Context.getCur();
+      Inst *Next = Context.getNextInst();
+      // Determine whether the current instruction is a Load
+      // instruction or equivalent.
+      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
+        // An InstLoad always qualifies.
+        LoadDest = Load->getDest();
+        const bool DoLegalize = false;
+        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
+                                    LoadDest->getType(), DoLegalize);
+      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
+        // An AtomicLoad intrinsic qualifies as long as it has a valid
+        // memory ordering, and can be implemented in a single
+        // instruction (i.e., not i64).
+        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
+        if (ID == Intrinsics::AtomicLoad &&
+            Intrin->getDest()->getType() != IceType_i64 &&
+            Intrinsics::isMemoryOrderValid(
+                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
+          LoadDest = Intrin->getDest();
+          const bool DoLegalize = false;
+          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
+                                      DoLegalize);
+        }
+      }
+      // A Load instruction can be folded into the following
+      // instruction only if the following instruction ends the Load's
+      // Dest variable's live range.
+      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
+        assert(LoadSrc);
+        Inst *NewInst = nullptr;
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
+          Operand *Src0 = Arith->getSrc(0);
+          Operand *Src1 = Arith->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstArithmetic::create(Func, Arith->getOp(),
+                                             Arith->getDest(), Src0, Src1);
+          }
+        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
+          Operand *Src0 = Icmp->getSrc(0);
+          Operand *Src1 = Icmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
+                                       Icmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
+          Operand *Src0 = Fcmp->getSrc(0);
+          Operand *Src1 = Fcmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
+                                       Fcmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
+          Operand *Src0 = Select->getTrueOperand();
+          Operand *Src1 = Select->getFalseOperand();
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstSelect::create(Func, Select->getDest(),
+                                         Select->getCondition(), Src0, Src1);
+          }
+        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
+          // The load dest can always be folded into a Cast
+          // instruction.
+          Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+          if (Src0 == LoadDest) {
+            NewInst = InstCast::create(Func, Cast->getCastKind(),
+                                       Cast->getDest(), LoadSrc);
+          }
+        }
+        if (NewInst) {
+          CurInst->setDeleted();
+          Next->setDeleted();
+          Context.insert(NewInst);
+          // Update NewInst->LiveRangesEnded so that target lowering
+          // may benefit.  Also update NewInst->HasSideEffects.
+          NewInst->spliceLivenessInfo(Next, CurInst);
+        }
+      }
+      Context.advanceCur();
+      Context.advanceNext();
+    }
+  }
+  Func->dump("After load optimization");
+}
+
+template <class Machine>
+bool TargetX86Base<Machine>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
+  if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
+    return Br->optimizeBranch(NextNode);
+  }
+  return false;
+}
+
+template <class Machine>
+IceString TargetX86Base<Machine>::RegNames[] = {
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  name,
+    REGX8632_TABLE
+#undef X
+};
+
+template <class Machine>
+Variable *TargetX86Base<Machine>::getPhysicalRegister(SizeT RegNum, Type Ty) {
+  if (Ty == IceType_void)
+    Ty = IceType_i32;
+  if (PhysicalRegisters[Ty].empty())
+    PhysicalRegisters[Ty].resize(RegX8632::Reg_NUM);
+  assert(RegNum < PhysicalRegisters[Ty].size());
+  Variable *Reg = PhysicalRegisters[Ty][RegNum];
+  if (Reg == nullptr) {
+    Reg = Func->makeVariable(Ty);
+    Reg->setRegNum(RegNum);
+    PhysicalRegisters[Ty][RegNum] = Reg;
+    // Specially mark esp as an "argument" so that it is considered
+    // live upon function entry.
+    if (RegNum == RegX8632::Reg_esp) {
+      Func->addImplicitArg(Reg);
+      Reg->setIgnoreLiveness();
+    }
+  }
+  return Reg;
+}
+
+template <class Machine>
+IceString TargetX86Base<Machine>::getRegName(SizeT RegNum, Type Ty) const {
+  assert(RegNum < RegX8632::Reg_NUM);
+  static IceString RegNames8[] = {
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  name8,
+      REGX8632_TABLE
+#undef X
+  };
+  static IceString RegNames16[] = {
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  name16,
+      REGX8632_TABLE
+#undef X
+  };
+  switch (Ty) {
+  case IceType_i1:
+  case IceType_i8:
+    return RegNames8[RegNum];
+  case IceType_i16:
+    return RegNames16[RegNum];
+  default:
+    return RegNames[RegNum];
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::emitVariable(const Variable *Var) const {
+  Ostream &Str = Ctx->getStrEmit();
+  if (Var->hasReg()) {
+    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
+    return;
+  }
+  if (Var->getWeight().isInf()) {
+    llvm_unreachable("Infinite-weight Variable has no register assigned");
+  }
+  int32_t Offset = Var->getStackOffset();
+  if (!hasFramePointer())
+    Offset += getStackAdjustment();
+  if (Offset)
+    Str << Offset;
+  const Type FrameSPTy = IceType_i32;
+  Str << "(%" << getRegName(getFrameOrStackReg(), FrameSPTy) << ")";
+}
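+
+// Sample emitVariable() output (illustrative): a variable assigned to eax
+// prints as "%eax"; a stack variable at offset 8 prints as "8(%esp)" (or
+// "8(%ebp)" in an ebp-based frame), and a zero offset prints as "(%esp)".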
+
+template <class Machine>
+X8632::Address
+TargetX86Base<Machine>::stackVarToAsmOperand(const Variable *Var) const {
+  if (Var->hasReg())
+    llvm_unreachable("Stack Variable has a register assigned");
+  if (Var->getWeight().isInf()) {
+    llvm_unreachable("Infinite-weight Variable has no register assigned");
+  }
+  int32_t Offset = Var->getStackOffset();
+  if (!hasFramePointer())
+    Offset += getStackAdjustment();
+  return X8632::Address(RegX8632::getEncodedGPR(getFrameOrStackReg()), Offset);
+}
+
+template <class Machine> void TargetX86Base<Machine>::lowerArguments() {
+  VarList &Args = Func->getArgs();
+  // The first four arguments of vector type, regardless of their
+  // position relative to the other arguments in the argument list, are
+  // passed in registers xmm0 - xmm3.
+  unsigned NumXmmArgs = 0;
+
+  Context.init(Func->getEntryNode());
+  Context.setInsertPoint(Context.getCur());
+
+  for (SizeT I = 0, E = Args.size();
+       I < E && NumXmmArgs < Traits::X86_MAX_XMM_ARGS; ++I) {
+    Variable *Arg = Args[I];
+    Type Ty = Arg->getType();
+    if (!isVectorType(Ty))
+      continue;
+    // Replace Arg in the argument list with the home register.  Then
+    // generate an instruction in the prolog to copy the home register
+    // to the assigned location of Arg.
+    int32_t RegNum = RegX8632::Reg_xmm0 + NumXmmArgs;
+    ++NumXmmArgs;
+    Variable *RegisterArg = Func->makeVariable(Ty);
+    if (ALLOW_DUMP)
+      RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
+    RegisterArg->setRegNum(RegNum);
+    RegisterArg->setIsArg();
+    Arg->setIsArg(false);
+
+    Args[I] = RegisterArg;
+    Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+  }
+}
+
+// Helper function for addProlog().
+//
+// This assumes Arg is an argument passed on the stack.  This sets the
+// frame offset for Arg and updates InArgsSizeBytes according to Arg's
+// width.  For an I64 arg that has been split into Lo and Hi components,
+// it calls itself recursively on the components, taking care to handle
+// Lo first because of the little-endian architecture.  Lastly, this
+// function generates an instruction to copy Arg into its assigned
+// register if applicable.
+template <class Machine>
+void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg,
+                                                    Variable *FramePtr,
+                                                    size_t BasicFrameOffset,
+                                                    size_t &InArgsSizeBytes) {
+  Variable *Lo = Arg->getLo();
+  Variable *Hi = Arg->getHi();
+  Type Ty = Arg->getType();
+  if (Lo && Hi && Ty == IceType_i64) {
+    assert(Lo->getType() != IceType_i64); // don't want infinite recursion
+    assert(Hi->getType() != IceType_i64); // don't want infinite recursion
+    finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    return;
+  }
+  if (isVectorType(Ty)) {
+    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
+  }
+  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
+  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
+  if (Arg->hasReg()) {
+    assert(Ty != IceType_i64);
+    OperandX8632Mem *Mem = OperandX8632Mem::create(
+        Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset()));
+    if (isVectorType(Arg->getType())) {
+      _movp(Arg, Mem);
+    } else {
+      _mov(Arg, Mem);
+    }
+    // This argument-copying instruction uses an explicit
+    // OperandX8632Mem operand instead of a Variable, so its
+    // fill-from-stack operation has to be tracked separately for
+    // statistics.
+    Ctx->statsUpdateFills();
+  }
+}
+
+template <class Machine> Type TargetX86Base<Machine>::stackSlotType() {
+  return IceType_i32;
+}
+
+template <class Machine> void TargetX86Base<Machine>::addProlog(CfgNode *Node) {
+  // Stack frame layout:
+  //
+  // +------------------------+
+  // | 1. return address      |
+  // +------------------------+
+  // | 2. preserved registers |
+  // +------------------------+
+  // | 3. padding             |
+  // +------------------------+
+  // | 4. global spill area   |
+  // +------------------------+
+  // | 5. padding             |
+  // +------------------------+
+  // | 6. local spill area    |
+  // +------------------------+
+  // | 7. padding             |
+  // +------------------------+
+  // | 8. allocas             |
+  // +------------------------+
+  //
+  // The following variables record the size in bytes of the given areas:
+  //  * X86_RET_IP_SIZE_BYTES:  area 1
+  //  * PreservedRegsSizeBytes: area 2
+  //  * SpillAreaPaddingBytes:  area 3
+  //  * GlobalsSize:            area 4
+  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
+  //  * LocalsSpillAreaSize:    area 6
+  //  * SpillAreaSizeBytes:     areas 3 - 7
+
+  // Determine stack frame offsets for each Variable without a
+  // register assignment.  This can be done as one variable per stack
+  // slot.  Or, do coalescing by running the register allocator again
+  // with an infinite set of registers (as a side effect, this gives
+  // variables a second chance at physical register assignment).
+  //
+  // A middle ground approach is to leverage sparsity and allocate one
+  // block of space on the frame for globals (variables with
+  // multi-block lifetime), and one block to share for locals
+  // (single-block lifetime).
+
+  Context.init(Node);
+  Context.setInsertPoint(Context.getCur());
+
+  llvm::SmallBitVector CalleeSaves =
+      getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
+  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
+  size_t GlobalsSize = 0;
+  // If there is a separate locals area, this represents that area.
+  // Otherwise it counts any variable not counted by GlobalsSize.
+  SpillAreaSizeBytes = 0;
+  // If there is a separate locals area, this specifies the alignment
+  // for it.
+  uint32_t LocalsSlotsAlignmentBytes = 0;
+  // The entire spill locations area gets aligned to largest natural
+  // alignment of the variables that have a spill slot.
+  uint32_t SpillAreaAlignmentBytes = 0;
+  // A spill slot linked to a variable with a stack slot should reuse
+  // that stack slot.
+  std::function<bool(Variable *)> TargetVarHook =
+      [&VariablesLinkedToSpillSlots](Variable *Var) {
+        if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
+          assert(Var->getWeight().isZero());
+          if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
+            VariablesLinkedToSpillSlots.push_back(Var);
+            return true;
+          }
+        }
+        return false;
+      };
+
+  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
+  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
+                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
+                        &LocalsSlotsAlignmentBytes, TargetVarHook);
+  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
+  SpillAreaSizeBytes += GlobalsSize;
+
+  // Add push instructions for preserved registers.
+  uint32_t NumCallee = 0;
+  size_t PreservedRegsSizeBytes = 0;
+  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
+    if (CalleeSaves[i] && RegsUsed[i]) {
+      ++NumCallee;
+      PreservedRegsSizeBytes += 4;
+      _push(getPhysicalRegister(i));
+    }
+  }
+  Ctx->statsUpdateRegistersSaved(NumCallee);
+
+  // Generate "push ebp; mov ebp, esp"
+  if (IsEbpBasedFrame) {
+    assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
+               .count() == 0);
+    PreservedRegsSizeBytes += 4;
+    Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
+    Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
+    _push(ebp);
+    _mov(ebp, esp);
+    // Keep ebp live for late-stage liveness analysis
+    // (e.g. asm-verbose mode).
+    Context.insert(InstFakeUse::create(Func, ebp));
+  }
+
+  // Align the variables area. SpillAreaPaddingBytes is the size of
+  // the region after the preserved registers and before the spill areas.
+  // LocalsSlotsPaddingBytes is the amount of padding between the globals
+  // and locals area if they are separate.
+  assert(SpillAreaAlignmentBytes <= Traits::X86_STACK_ALIGNMENT_BYTES);
+  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
+  uint32_t SpillAreaPaddingBytes = 0;
+  uint32_t LocalsSlotsPaddingBytes = 0;
+  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
+                       SpillAreaAlignmentBytes, GlobalsSize,
+                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
+                       &LocalsSlotsPaddingBytes);
+  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
+  uint32_t GlobalsAndSubsequentPaddingSize =
+      GlobalsSize + LocalsSlotsPaddingBytes;
+
+  // Align esp if necessary.
+  if (NeedsStackAlignment) {
+    uint32_t StackOffset =
+        Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
+    uint32_t StackSize =
+        Traits::applyStackAlignment(StackOffset + SpillAreaSizeBytes);
+    SpillAreaSizeBytes = StackSize - StackOffset;
+  }
+
+  // Generate "sub esp, SpillAreaSizeBytes"
+  if (SpillAreaSizeBytes)
+    _sub(getPhysicalRegister(RegX8632::Reg_esp),
+         Ctx->getConstantInt32(SpillAreaSizeBytes));
+  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
+
+  resetStackAdjustment();
+
+  // Fill in stack offsets for stack args, and copy args into registers
+  // for those that were register-allocated.  Args are pushed right to
+  // left, so Arg[0] is closest to the stack/frame pointer.
+  Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
+  size_t BasicFrameOffset =
+      PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
+  if (!IsEbpBasedFrame)
+    BasicFrameOffset += SpillAreaSizeBytes;
+
+  const VarList &Args = Func->getArgs();
+  size_t InArgsSizeBytes = 0;
+  unsigned NumXmmArgs = 0;
+  for (Variable *Arg : Args) {
+    // Skip arguments passed in registers.
+    if (isVectorType(Arg->getType()) && NumXmmArgs < Traits::X86_MAX_XMM_ARGS) {
+      ++NumXmmArgs;
+      continue;
+    }
+    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+  }
+
+  // Fill in stack offsets for locals.
+  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
+                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
+                      IsEbpBasedFrame);
+  // Assign stack offsets to variables that have been linked to spilled
+  // variables.
+  for (Variable *Var : VariablesLinkedToSpillSlots) {
+    Variable *Linked = (llvm::cast<SpillVariable>(Var))->getLinkedTo();
+    Var->setStackOffset(Linked->getStackOffset());
+  }
+  this->HasComputedFrame = true;
+
+  if (ALLOW_DUMP && Func->isVerbose(IceV_Frame)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+
+    Str << "Stack layout:\n";
+    uint32_t EspAdjustmentPaddingSize =
+        SpillAreaSizeBytes - LocalsSpillAreaSize -
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
+    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
+        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
+        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
+        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
+        << " globals spill area = " << GlobalsSize << " bytes\n"
+        << " globals-locals spill areas intermediate padding = "
+        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
+        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
+        << " esp alignment padding = " << EspAdjustmentPaddingSize
+        << " bytes\n";
+
+    Str << "Stack details:\n"
+        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
+        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
+        << " bytes\n"
+        << " is ebp based = " << IsEbpBasedFrame << "\n";
+  }
+}
+
+template <class Machine> void TargetX86Base<Machine>::addEpilog(CfgNode *Node) {
+  InstList &Insts = Node->getInsts();
+  InstList::reverse_iterator RI, E;
+  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
+    if (llvm::isa<InstX8632Ret>(*RI))
+      break;
+  }
+  if (RI == E)
+    return;
+
+  // Convert the reverse_iterator position into its corresponding
+  // (forward) iterator position.
+  InstList::iterator InsertPoint = RI.base();
+  --InsertPoint;
+  Context.init(Node);
+  Context.setInsertPoint(InsertPoint);
+
+  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
+  if (IsEbpBasedFrame) {
+    Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
+    // For late-stage liveness analysis (e.g. asm-verbose mode),
+    // adding a fake use of esp before the assignment of esp=ebp keeps
+    // previous esp adjustments from being dead-code eliminated.
+    Context.insert(InstFakeUse::create(Func, esp));
+    _mov(esp, ebp);
+    _pop(ebp);
+  } else {
+    // add esp, SpillAreaSizeBytes
+    if (SpillAreaSizeBytes)
+      _add(esp, Ctx->getConstantInt32(SpillAreaSizeBytes));
+  }
+
+  // Add pop instructions for preserved registers.
+  llvm::SmallBitVector CalleeSaves =
+      getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
+    SizeT j = CalleeSaves.size() - i - 1;
+    if (j == RegX8632::Reg_ebp && IsEbpBasedFrame)
+      continue;
+    if (CalleeSaves[j] && RegsUsed[j]) {
+      _pop(getPhysicalRegister(j));
+    }
+  }
+
+  if (!Ctx->getFlags().getUseSandboxing())
+    return;
+  // Change the original ret instruction into a sandboxed return sequence.
+  // t:ecx = pop
+  // bundle_lock
+  // and t, ~31
+  // jmp *t
+  // bundle_unlock
+  // FakeUse <original_ret_operand>
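+  // For example, with the usual 32-byte NaCl bundles, BundleSize is 32, so
+  // ~(BundleSize - 1) is ~31 and the masking instruction is effectively
+  // "and ecx, 0xffffffe0".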
+  const SizeT BundleSize =
+      1 << Func->template getAssembler<>()->getBundleAlignLog2Bytes();
+  Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
+  _pop(T_ecx);
+  _bundle_lock();
+  _and(T_ecx, Ctx->getConstantInt32(~(BundleSize - 1)));
+  _jmp(T_ecx);
+  _bundle_unlock();
+  if (RI->getSrcSize()) {
+    Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
+    Context.insert(InstFakeUse::create(Func, RetValue));
+  }
+  RI->setDeleted();
+}
+
+template <class Machine> void TargetX86Base<Machine>::split64(Variable *Var) {
+  switch (Var->getType()) {
+  default:
+    return;
+  case IceType_i64:
+  // TODO: Only consider F64 if we need to push each half when
+  // passing as an argument to a function call.  Note that each half
+  // is still typed as I32.
+  case IceType_f64:
+    break;
+  }
+  Variable *Lo = Var->getLo();
+  Variable *Hi = Var->getHi();
+  if (Lo) {
+    assert(Hi);
+    return;
+  }
+  assert(Hi == nullptr);
+  Lo = Func->makeVariable(IceType_i32);
+  Hi = Func->makeVariable(IceType_i32);
+  if (ALLOW_DUMP) {
+    Lo->setName(Func, Var->getName(Func) + "__lo");
+    Hi->setName(Func, Var->getName(Func) + "__hi");
+  }
+  Var->setLoHi(Lo, Hi);
+  if (Var->getIsArg()) {
+    Lo->setIsArg();
+    Hi->setIsArg();
+  }
+}
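+
+// For example, an i64 variable "a" is given i32 halves named "a__lo" and
+// "a__hi" (the names are only attached when dumping is enabled), with the
+// lo half holding bits 0..31 and the hi half bits 32..63.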
+
+template <class Machine>
+Operand *TargetX86Base<Machine>::loOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
+    return Operand;
+  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
+    split64(Var);
+    return Var->getLo();
+  }
+  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    ConstantInteger32 *ConstInt = llvm::dyn_cast<ConstantInteger32>(
+        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
+    return legalize(ConstInt);
+  }
+  if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
+    OperandX8632Mem *MemOperand = OperandX8632Mem::create(
+        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
+        Mem->getShift(), Mem->getSegmentRegister());
+    // Test whether we should randomize or pool the offset.  If so, blind or
+    // pool it and create a mem operand with the blinded/pooled constant;
+    // otherwise, return the mem operand unchanged.
+    return legalize(MemOperand);
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
+
+template <class Machine>
+Operand *TargetX86Base<Machine>::hiOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
+    return Operand;
+  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
+    split64(Var);
+    return Var->getHi();
+  }
+  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    ConstantInteger32 *ConstInt = llvm::dyn_cast<ConstantInteger32>(
+        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
+    // Check whether we need to blind/pool the constant.
+    return legalize(ConstInt);
+  }
+  if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
+    Constant *Offset = Mem->getOffset();
+    if (Offset == nullptr) {
+      Offset = Ctx->getConstantInt32(4);
+    } else if (ConstantInteger32 *IntOffset =
+                   llvm::dyn_cast<ConstantInteger32>(Offset)) {
+      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
+    } else if (ConstantRelocatable *SymOffset =
+                   llvm::dyn_cast<ConstantRelocatable>(Offset)) {
+      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
+      Offset =
+          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName(),
+                              SymOffset->getSuppressMangling());
+    }
+    OperandX8632Mem *MemOperand = OperandX8632Mem::create(
+        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
+        Mem->getShift(), Mem->getSegmentRegister());
+    // Test whether the Offset is an eligible i32 constant for randomization
+    // and pooling.  Blind/pool it if so; otherwise return the mem operand
+    // unchanged.
+    return legalize(MemOperand);
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
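+
+// Illustrative effect of loOperand()/hiOperand(): for an i64 memory operand
+// at [ebp+8], loOperand() produces an i32 operand at [ebp+8] and hiOperand()
+// an i32 operand at [ebp+12], matching the little-endian layout of i64.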
+
+template <class Machine>
+llvm::SmallBitVector
+TargetX86Base<Machine>::getRegisterSet(RegSetMask Include,
+                                       RegSetMask Exclude) const {
+  llvm::SmallBitVector Registers(RegX8632::Reg_NUM);
+
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  if (scratch && (Include & RegSet_CallerSave))                                \
+    Registers[RegX8632::val] = true;                                           \
+  if (preserved && (Include & RegSet_CalleeSave))                              \
+    Registers[RegX8632::val] = true;                                           \
+  if (stackptr && (Include & RegSet_StackPointer))                             \
+    Registers[RegX8632::val] = true;                                           \
+  if (frameptr && (Include & RegSet_FramePointer))                             \
+    Registers[RegX8632::val] = true;                                           \
+  if (scratch && (Exclude & RegSet_CallerSave))                                \
+    Registers[RegX8632::val] = false;                                          \
+  if (preserved && (Exclude & RegSet_CalleeSave))                              \
+    Registers[RegX8632::val] = false;                                          \
+  if (stackptr && (Exclude & RegSet_StackPointer))                             \
+    Registers[RegX8632::val] = false;                                          \
+  if (frameptr && (Exclude & RegSet_FramePointer))                             \
+    Registers[RegX8632::val] = false;
+
+  REGX8632_TABLE
+
+#undef X
+
+  return Registers;
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
+  IsEbpBasedFrame = true;
+  // Conservatively require the stack to be aligned.  Some stack
+  // adjustment operations implemented below assume that the stack is
+  // aligned before the alloca.  All the alloca code ensures that the
+  // stack alignment is preserved after the alloca.  The stack alignment
+  // restriction can be relaxed in some cases.
+  NeedsStackAlignment = true;
+
+  // TODO(stichnot): minimize the number of adjustments of esp, etc.
+  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
+  Operand *TotalSize = legalize(Inst->getSizeInBytes());
+  Variable *Dest = Inst->getDest();
+  uint32_t AlignmentParam = Inst->getAlignInBytes();
+  // For default align=0, set it to the real value 1, to avoid any
+  // bit-manipulation problems below.
+  AlignmentParam = std::max(AlignmentParam, 1u);
+
+  // LLVM enforces power of 2 alignment.
+  assert(llvm::isPowerOf2_32(AlignmentParam));
+  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
+
+  uint32_t Alignment =
+      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
+  if (Alignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
+    _and(esp, Ctx->getConstantInt32(-Alignment));
+  }
+  if (const auto *ConstantTotalSize =
+          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
+    uint32_t Value = ConstantTotalSize->getValue();
+    Value = Utils::applyAlignment(Value, Alignment);
+    _sub(esp, Ctx->getConstantInt32(Value));
+  } else {
+    // Non-constant sizes need to be adjusted to the next highest
+    // multiple of the required alignment at runtime.
+    Variable *T = makeReg(IceType_i32);
+    _mov(T, TotalSize);
+    _add(T, Ctx->getConstantInt32(Alignment - 1));
+    _and(T, Ctx->getConstantInt32(-Alignment));
+    _sub(esp, T);
+  }
+  _mov(Dest, esp);
+}
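+
+// Example of the runtime rounding above (assuming the default 16-byte stack
+// alignment): if the non-constant size evaluates to 20 at runtime, then
+// T = (20 + 15) & -16 = 32 bytes are subtracted from esp.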
+
+// Strength-reduce scalar integer multiplication by a constant (for
+// i32 or narrower) for certain constants.  The lea instruction can be
+// used to multiply by 3, 5, or 9, and the lsh instruction can be used
+// to multiply by powers of 2.  These can be combined such that
+// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5,
+// combined with left-shifting by 2.
+template <class Machine>
+bool TargetX86Base<Machine>::optimizeScalarMul(Variable *Dest, Operand *Src0,
+                                               int32_t Src1) {
+  // Disable this optimization for Om1 and O0, just to keep things
+  // simple there.
+  if (Ctx->getFlags().getOptLevel() < Opt_1)
+    return false;
+  Type Ty = Dest->getType();
+  Variable *T = nullptr;
+  if (Src1 == -1) {
+    _mov(T, Src0);
+    _neg(T);
+    _mov(Dest, T);
+    return true;
+  }
+  if (Src1 == 0) {
+    _mov(Dest, Ctx->getConstantZero(Ty));
+    return true;
+  }
+  if (Src1 == 1) {
+    _mov(T, Src0);
+    _mov(Dest, T);
+    return true;
+  }
+  // Don't bother with the edge case where Src1 == MININT.
+  if (Src1 == -Src1)
+    return false;
+  const bool Src1IsNegative = Src1 < 0;
+  if (Src1IsNegative)
+    Src1 = -Src1;
+  uint32_t Count9 = 0;
+  uint32_t Count5 = 0;
+  uint32_t Count3 = 0;
+  uint32_t Count2 = 0;
+  uint32_t CountOps = 0;
+  while (Src1 > 1) {
+    if (Src1 % 9 == 0) {
+      ++CountOps;
+      ++Count9;
+      Src1 /= 9;
+    } else if (Src1 % 5 == 0) {
+      ++CountOps;
+      ++Count5;
+      Src1 /= 5;
+    } else if (Src1 % 3 == 0) {
+      ++CountOps;
+      ++Count3;
+      Src1 /= 3;
+    } else if (Src1 % 2 == 0) {
+      if (Count2 == 0)
+        ++CountOps;
+      ++Count2;
+      Src1 /= 2;
+    } else {
+      return false;
+    }
+  }
+  // Lea optimization only works for i16 and i32 types, not i8.
+  if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9))
+    return false;
+  // Limit the number of lea/shl operations for a single multiply, to
+  // a somewhat arbitrary choice of 3.
+  const uint32_t MaxOpsForOptimizedMul = 3;
+  if (CountOps > MaxOpsForOptimizedMul)
+    return false;
+  _mov(T, Src0);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  for (uint32_t i = 0; i < Count9; ++i) {
+    const uint16_t Shift = 3; // log2(9-1)
+    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+    _set_dest_nonkillable();
+  }
+  for (uint32_t i = 0; i < Count5; ++i) {
+    const uint16_t Shift = 2; // log2(5-1)
+    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+    _set_dest_nonkillable();
+  }
+  for (uint32_t i = 0; i < Count3; ++i) {
+    const uint16_t Shift = 1; // log2(3-1)
+    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+    _set_dest_nonkillable();
+  }
+  if (Count2) {
+    _shl(T, Ctx->getConstantInt(Ty, Count2));
+  }
+  if (Src1IsNegative)
+    _neg(T);
+  _mov(Dest, T);
+  return true;
+}
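+
+// For example, multiplying by 100 factors as 5 * 5 * 2 * 2 (Count5 = 2,
+// Count2 = 2), so the emitted sequence is roughly (sketch):
+//   lea t, [t + 4*t]  ; t *= 5
+//   lea t, [t + 4*t]  ; t *= 5
+//   shl t, 2          ; t *= 4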
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
+  Variable *Dest = Inst->getDest();
+  Operand *Src0 = legalize(Inst->getSrc(0));
+  Operand *Src1 = legalize(Inst->getSrc(1));
+  if (Inst->isCommutative()) {
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+      std::swap(Src0, Src1);
+    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
+      std::swap(Src0, Src1);
+  }
+  if (Dest->getType() == IceType_i64) {
+    // These helper-call-involved instructions are lowered in this separate
+    // switch because loOperand() and hiOperand() may insert redundant
+    // instructions for constant blinding and pooling.  Such redundant
+    // instructions would fail liveness analysis under the -Om1 setting.
+    // Moreover, these arguments do not need to be processed by loOperand()
+    // and hiOperand() in order to be used.
+    switch (Inst->getOp()) {
+    case InstArithmetic::Udiv: {
+      const SizeT MaxSrcs = 2;
+      InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      Call->addArg(Inst->getSrc(1));
+      lowerCall(Call);
+      return;
+    }
+    case InstArithmetic::Sdiv: {
+      const SizeT MaxSrcs = 2;
+      InstCall *Call = makeHelperCall(H_sdiv_i64, Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      Call->addArg(Inst->getSrc(1));
+      lowerCall(Call);
+      return;
+    }
+    case InstArithmetic::Urem: {
+      const SizeT MaxSrcs = 2;
+      InstCall *Call = makeHelperCall(H_urem_i64, Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      Call->addArg(Inst->getSrc(1));
+      lowerCall(Call);
+      return;
+    }
+    case InstArithmetic::Srem: {
+      const SizeT MaxSrcs = 2;
+      InstCall *Call = makeHelperCall(H_srem_i64, Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      Call->addArg(Inst->getSrc(1));
+      lowerCall(Call);
+      return;
+    }
+    default:
+      break;
+    }
+
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    Operand *Src1Lo = loOperand(Src1);
+    Operand *Src1Hi = hiOperand(Src1);
+    Variable *T_Lo = nullptr, *T_Hi = nullptr;
+    switch (Inst->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
+    case InstArithmetic::Add:
+      _mov(T_Lo, Src0Lo);
+      _add(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _adc(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::And:
+      _mov(T_Lo, Src0Lo);
+      _and(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _and(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Or:
+      _mov(T_Lo, Src0Lo);
+      _or(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _or(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Xor:
+      _mov(T_Lo, Src0Lo);
+      _xor(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _xor(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Sub:
+      _mov(T_Lo, Src0Lo);
+      _sub(T_Lo, Src1Lo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, Src0Hi);
+      _sbb(T_Hi, Src1Hi);
+      _mov(DestHi, T_Hi);
+      break;
+    case InstArithmetic::Mul: {
+      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+      Variable *T_4Lo = makeReg(IceType_i32, RegX8632::Reg_eax);
+      Variable *T_4Hi = makeReg(IceType_i32, RegX8632::Reg_edx);
+      // gcc does the following:
+      // a=b*c ==>
+      //   t1 = b.hi; t1 *=(imul) c.lo
+      //   t2 = c.hi; t2 *=(imul) b.lo
+      //   t3:eax = b.lo
+      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
+      //   a.lo = t4.lo
+      //   t4.hi += t1
+      //   t4.hi += t2
+      //   a.hi = t4.hi
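+      // The algebra behind this sequence: (bHi*2^32 + bLo) * (cHi*2^32 +
+      // cLo) mod 2^64 equals bLo*cLo + ((bHi*cLo + bLo*cHi) << 32); the
+      // bHi*cHi term overflows out of the low 64 bits and is dropped.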
+      // The mul instruction cannot take an immediate operand.
+      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
+      _mov(T_1, Src0Hi);
+      _imul(T_1, Src1Lo);
+      _mov(T_2, Src1Hi);
+      _imul(T_2, Src0Lo);
+      _mov(T_3, Src0Lo, RegX8632::Reg_eax);
+      _mul(T_4Lo, T_3, Src1Lo);
+      // The mul instruction produces two dest variables, edx:eax.  We
+      // create a fake definition of edx to account for this.
+      Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo));
+      _mov(DestLo, T_4Lo);
+      _add(T_4Hi, T_1);
+      _add(T_4Hi, T_2);
+      _mov(DestHi, T_4Hi);
+    } break;
+    case InstArithmetic::Shl: {
+      // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
+      // gcc does the following:
+      // a=b<<c ==>
+      //   t1:ecx = c.lo & 0xff
+      //   t2 = b.lo
+      //   t3 = b.hi
+      //   t3 = shld t3, t2, t1
+      //   t2 = shl t2, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t3)
+      //   t3 = t2
+      //   t2 = 0
+      // L1:
+      //   a.lo = t2
+      //   a.hi = t3
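+      // For shift amounts >= 32 (bit 0x20 of t1 set), the branch is not
+      // taken: e.g. b << 40 uses the hardware shift count 40 & 31 = 8, then
+      // moves the shifted low word into the high word and zeroes the low
+      // word, yielding a.hi = b.lo << 8 and a.lo = 0.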
+      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+      Constant *BitTest = Ctx->getConstantInt32(0x20);
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      InstX8632Label *Label = InstX8632Label::create(Func, this);
+      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
+      _mov(T_2, Src0Lo);
+      _mov(T_3, Src0Hi);
+      _shld(T_3, T_2, T_1);
+      _shl(T_2, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the
+      // intra-block control flow, so we need the _mov_nonkillable
+      // variant to avoid liveness problems.
+      _mov_nonkillable(T_3, T_2);
+      _mov_nonkillable(T_2, Zero);
+      Context.insert(Label);
+      _mov(DestLo, T_2);
+      _mov(DestHi, T_3);
+    } break;
+    case InstArithmetic::Lshr: {
+      // a=b>>c (unsigned) ==>
+      //   t1:ecx = c.lo & 0xff
+      //   t2 = b.lo
+      //   t3 = b.hi
+      //   t2 = shrd t2, t3, t1
+      //   t3 = shr t3, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t2)
+      //   t2 = t3
+      //   t3 = 0
+      // L1:
+      //   a.lo = t2
+      //   a.hi = t3
+      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+      Constant *BitTest = Ctx->getConstantInt32(0x20);
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      InstX8632Label *Label = InstX8632Label::create(Func, this);
+      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
+      _mov(T_2, Src0Lo);
+      _mov(T_3, Src0Hi);
+      _shrd(T_2, T_3, T_1);
+      _shr(T_3, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the
+      // intra-block control flow, so we need the _mov_nonkillable
+      // variant to avoid liveness problems.
+      _mov_nonkillable(T_2, T_3);
+      _mov_nonkillable(T_3, Zero);
+      Context.insert(Label);
+      _mov(DestLo, T_2);
+      _mov(DestHi, T_3);
+    } break;
+    case InstArithmetic::Ashr: {
+      // a=b>>c (signed) ==>
+      //   t1:ecx = c.lo & 0xff
+      //   t2 = b.lo
+      //   t3 = b.hi
+      //   t2 = shrd t2, t3, t1
+      //   t3 = sar t3, t1
+      //   test t1, 0x20
+      //   je L1
+      //   use(t2)
+      //   t2 = t3
+      //   t3 = sar t3, 0x1f
+      // L1:
+      //   a.lo = t2
+      //   a.hi = t3
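+      // For counts in [32, 63], "sar t3, 0x1f" fills t3 with copies of
+      // the sign bit, which is the correct a.hi once the arithmetic
+      // shift has consumed the entire low word.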
+      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
+      Constant *BitTest = Ctx->getConstantInt32(0x20);
+      Constant *SignExtend = Ctx->getConstantInt32(0x1f);
+      InstX8632Label *Label = InstX8632Label::create(Func, this);
+      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
+      _mov(T_2, Src0Lo);
+      _mov(T_3, Src0Hi);
+      _shrd(T_2, T_3, T_1);
+      _sar(T_3, T_1);
+      _test(T_1, BitTest);
+      _br(CondX86::Br_e, Label);
+      // T_2 and T_3 are being assigned again because of the
+      // intra-block control flow, so T_2 needs the _mov_nonkillable
+      // variant to avoid liveness problems.  T_3 doesn't need special
+      // treatment because it is reassigned via _sar instead of _mov.
+      _mov_nonkillable(T_2, T_3);
+      _sar(T_3, SignExtend);
+      Context.insert(Label);
+      _mov(DestLo, T_2);
+      _mov(DestHi, T_3);
+    } break;
+    case InstArithmetic::Fadd:
+    case InstArithmetic::Fsub:
+    case InstArithmetic::Fmul:
+    case InstArithmetic::Fdiv:
+    case InstArithmetic::Frem:
+      llvm_unreachable("FP instruction with i64 type");
+      break;
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Srem:
+      llvm_unreachable("Call-helper-involved instruction for i64 type \
+                       should have already been handled before");
+      break;
+    }
+    return;
+  }
+  if (isVectorType(Dest->getType())) {
+    // TODO: Trap on integer divide and integer modulo by zero.
+    // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
+    if (llvm::isa<OperandX8632Mem>(Src1))
+      Src1 = legalizeToVar(Src1);
+    switch (Inst->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
+    case InstArithmetic::Add: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _padd(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::And: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _pand(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Or: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _por(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Xor: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _pxor(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Sub: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _psub(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Mul: {
+      bool TypesAreValidForPmull =
+          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
+      bool InstructionSetIsValidForPmull =
+          Dest->getType() == IceType_v8i16 || InstructionSet >= Machine::SSE4_1;
+      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
+        Variable *T = makeReg(Dest->getType());
+        _movp(T, Src0);
+        _pmull(T, Src1);
+        _movp(Dest, T);
+      } else if (Dest->getType() == IceType_v4i32) {
+        // Lowering sequence:
+        // Note: The mask arguments have index 0 on the left.
+        //
+        // movups  T1, Src0
+        // pshufd  T2, Src0, {1,0,3,0}
+        // pshufd  T3, Src1, {1,0,3,0}
+        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
+        // pmuludq T1, Src1
+        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
+        // pmuludq T2, T3
+        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
+        // shufps  T1, T2, {0,2,0,2}
+        // pshufd  T4, T1, {0,2,1,3}
+        // movups  Dest, T4
+
+        // Mask that directs pshufd to create a vector with entries
+        // Src[1, 0, 3, 0]
+        const unsigned Constant1030 = 0x31;
+        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
+        // Mask that directs shufps to create a vector with entries
+        // Dest[0, 2], Src[0, 2]
+        const unsigned Mask0202 = 0x88;
+        // Mask that directs pshufd to create a vector with entries
+        // Src[0, 2, 1, 3]
+        const unsigned Mask0213 = 0xd8;
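+        // To see how the immediates encode these index lists: the imm8
+        // holds four 2-bit element selectors, lowest field first.  E.g.
+        // 0x31 = 0b00'11'00'01 -> fields 1,0,3,0;
+        // 0x88 = 0b10'00'10'00 -> fields 0,2,0,2;
+        // 0xd8 = 0b11'01'10'00 -> fields 0,2,1,3.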
+        Variable *T1 = makeReg(IceType_v4i32);
+        Variable *T2 = makeReg(IceType_v4i32);
+        Variable *T3 = makeReg(IceType_v4i32);
+        Variable *T4 = makeReg(IceType_v4i32);
+        _movp(T1, Src0);
+        _pshufd(T2, Src0, Mask1030);
+        _pshufd(T3, Src1, Mask1030);
+        _pmuludq(T1, Src1);
+        _pmuludq(T2, T3);
+        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
+        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
+        _movp(Dest, T4);
+      } else {
+        assert(Dest->getType() == IceType_v16i8);
+        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      }
+    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Srem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
+    case InstArithmetic::Fadd: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _addps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fsub: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _subps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fmul: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _mulps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Fdiv: {
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0);
+      _divps(T, Src1);
+      _movp(Dest, T);
+    } break;
+    case InstArithmetic::Frem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
+    }
+    return;
+  }
+  Variable *T_edx = nullptr;
+  Variable *T = nullptr;
+  switch (Inst->getOp()) {
+  case InstArithmetic::_num:
+    llvm_unreachable("Unknown arithmetic operator");
+    break;
+  case InstArithmetic::Add:
+    _mov(T, Src0);
+    _add(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::And:
+    _mov(T, Src0);
+    _and(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Or:
+    _mov(T, Src0);
+    _or(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Xor:
+    _mov(T, Src0);
+    _xor(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Sub:
+    _mov(T, Src0);
+    _sub(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Mul:
+    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+      if (optimizeScalarMul(Dest, Src0, C->getValue()))
+        return;
+    }
+    // The 8-bit version of imul only allows the form "imul r/m8"
+    // where T must be in eax.
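+    // (imul r/m8 computes ax = al * r/m8, so the source and product
+    // are implicitly tied to al/ax; hence the forced eax assignment.)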
+    if (isByteSizedArithType(Dest->getType())) {
+      _mov(T, Src0, RegX8632::Reg_eax);
+      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    } else {
+      _mov(T, Src0);
+    }
+    _imul(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Shl:
+    _mov(T, Src0);
+    if (!llvm::isa<Constant>(Src1))
+      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+    _shl(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Lshr:
+    _mov(T, Src0);
+    if (!llvm::isa<Constant>(Src1))
+      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+    _shr(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Ashr:
+    _mov(T, Src0);
+    if (!llvm::isa<Constant>(Src1))
+      Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+    _sar(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Udiv:
+    // div and idiv are among the few arithmetic operators that do not
+    // allow an immediate operand.
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      Variable *T_ah = nullptr;
+      Constant *Zero = Ctx->getConstantZero(IceType_i8);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _mov(T_ah, Zero, RegX8632::Reg_ah);
+      _div(T, Src1, T_ah);
+      _mov(Dest, T);
+    } else {
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _mov(T_edx, Zero, RegX8632::Reg_edx);
+      _div(T, Src1, T_edx);
+      _mov(Dest, T);
+    }
+    break;
+  case InstArithmetic::Sdiv:
+    // TODO(stichnot): Enable this after doing better performance
+    // and cross testing.
+    if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
+      // Optimize division by constant power of 2, but not for Om1
+      // or O0, just to keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        int32_t Divisor = C->getValue();
+        uint32_t UDivisor = static_cast<uint32_t>(Divisor);
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          Type Ty = Dest->getType();
+          // LLVM does the following for dest=src/(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   sar t,log
+          //   dest=t
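+          // For example, with src = -7 and log = 2 (divide by 4):
+          //   t = -7; sar 31 -> -1; shr 30 -> 3; add src -> -4;
+          //   sar 2 -> -1, matching C's truncating -7/4 == -1.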
+          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
+          _mov(T, Src0);
+          // If for some reason we are dividing by 1, just treat it
+          // like an assignment.
+          if (LogDiv > 0) {
+            // The initial sar is unnecessary when dividing by 2.
+            if (LogDiv > 1)
+              _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+            _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+            _add(T, Src0);
+            _sar(T, Ctx->getConstantInt(Ty, LogDiv));
+          }
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T, T);
+      _idiv(T, Src1, T);
+      _mov(Dest, T);
+    } else {
+      T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T_edx, T);
+      _idiv(T, Src1, T_edx);
+      _mov(Dest, T);
+    }
+    break;
+  case InstArithmetic::Urem:
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      Variable *T_ah = nullptr;
+      Constant *Zero = Ctx->getConstantZero(IceType_i8);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _mov(T_ah, Zero, RegX8632::Reg_ah);
+      _div(T_ah, Src1, T);
+      _mov(Dest, T_ah);
+    } else {
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(T_edx, Zero, RegX8632::Reg_edx);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _div(T_edx, Src1, T);
+      _mov(Dest, T_edx);
+    }
+    break;
+  case InstArithmetic::Srem:
+    // TODO(stichnot): Enable this after doing better performance
+    // and cross testing.
+    if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
+      // Optimize mod by constant power of 2, but not for Om1 or O0,
+      // just to keep things simple there.
+      if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+        int32_t Divisor = C->getValue();
+        uint32_t UDivisor = static_cast<uint32_t>(Divisor);
+        if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+          uint32_t LogDiv = llvm::Log2_32(UDivisor);
+          Type Ty = Dest->getType();
+          // LLVM does the following for dest=src%(1<<log):
+          //   t=src
+          //   sar t,typewidth-1 // -1 if src is negative, 0 if not
+          //   shr t,typewidth-log
+          //   add t,src
+          //   and t, -(1<<log)
+          //   sub t,src
+          //   neg t
+          //   dest=t
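+          // For example, with src = -7 and log = 2 (mod by 4):
+          //   t = -7; sar 31 -> -1; shr 30 -> 3; add src -> -4;
+          //   and -4 -> -4; sub src -> 3; neg -> -3,
+          // matching C's -7 % 4 == -3.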
+          uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
+          // If for some reason we are dividing by 1, just assign 0.
+          if (LogDiv == 0) {
+            _mov(Dest, Ctx->getConstantZero(Ty));
+            return;
+          }
+          _mov(T, Src0);
+          // The initial sar is unnecessary when dividing by 2.
+          if (LogDiv > 1)
+            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+          _add(T, Src0);
+          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
+          _sub(T, Src0);
+          _neg(T);
+          _mov(Dest, T);
+          return;
+        }
+      }
+    }
+    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+    if (isByteSizedArithType(Dest->getType())) {
+      Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T, T);
+      Context.insert(InstFakeDef::create(Func, T_ah));
+      _idiv(T_ah, Src1, T);
+      _mov(Dest, T_ah);
+    } else {
+      T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+      _mov(T, Src0, RegX8632::Reg_eax);
+      _cbwdq(T_edx, T);
+      _idiv(T_edx, Src1, T);
+      _mov(Dest, T_edx);
+    }
+    break;
+  case InstArithmetic::Fadd:
+    _mov(T, Src0);
+    _addss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fsub:
+    _mov(T, Src0);
+    _subss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fmul:
+    _mov(T, Src0);
+    _mulss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Fdiv:
+    _mov(T, Src0);
+    _divss(T, Src1);
+    _mov(Dest, T);
+    break;
+  case InstArithmetic::Frem: {
+    const SizeT MaxSrcs = 2;
+    Type Ty = Dest->getType();
+    InstCall *Call = makeHelperCall(
+        isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
+    Call->addArg(Src0);
+    Call->addArg(Src1);
+    return lowerCall(Call);
+  }
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerAssign(const InstAssign *Inst) {
+  Variable *Dest = Inst->getDest();
+  Operand *Src0 = Inst->getSrc(0);
+  assert(Dest->getType() == Src0->getType());
+  if (Dest->getType() == IceType_i64) {
+    Src0 = legalize(Src0);
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Variable *T_Lo = nullptr, *T_Hi = nullptr;
+    _mov(T_Lo, Src0Lo);
+    _mov(DestLo, T_Lo);
+    _mov(T_Hi, Src0Hi);
+    _mov(DestHi, T_Hi);
+  } else {
+    Operand *RI;
+    if (Dest->hasReg()) {
+      // If Dest already has a physical register, then legalize the
+      // Src operand into a Variable with the same register
+      // assignment.  This is mostly a workaround for advanced phi
+      // lowering's ad-hoc register allocation which assumes no
+      // register allocation is needed when at least one of the
+      // operands is non-memory.
+
+      // If we have a physical register for the dest variable, we can
+      // enable our constant blinding or pooling again. Note this is
+      // only for advancedPhiLowering(), the flag flip should leave
+      // no other side effect.
+      {
+        BoolFlagSaver B(RandomizationPoolingPaused, false);
+        RI = legalize(Src0, Legal_Reg, Dest->getRegNum());
+      }
+    } else {
+      // If Dest could be a stack operand, then RI must be a physical
+      // register or a scalar integer immediate.
+      RI = legalize(Src0, Legal_Reg | Legal_Imm);
+    }
+    if (isVectorType(Dest->getType()))
+      _movp(Dest, RI);
+    else
+      _mov(Dest, RI);
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerBr(const InstBr *Inst) {
+  if (Inst->isUnconditional()) {
+    _br(Inst->getTargetUnconditional());
+    return;
+  }
+  Operand *Cond = Inst->getCondition();
+
+  // Handle folding opportunities.
+  if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32: {
+      // TODO(stichnot): Refactor similarities between this block and
+      // the corresponding code in lowerIcmp().
+      auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
+      Operand *Src0 = Producer->getSrc(0);
+      Operand *Src1 = legalize(Producer->getSrc(1));
+      Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
+      _cmp(Src0RM, Src1);
+      _br(Traits::getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(),
+          Inst->getTargetFalse());
+      return;
+    }
+    }
+  }
+
+  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  _cmp(Src0, Zero);
+  _br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerCall(const InstCall *Instr) {
+  // x86-32 calling convention:
+  //
+  // * At the point before the call, the stack must be aligned to 16
+  // bytes.
+  //
+  // * The first four arguments of vector type, regardless of their
+  // position relative to the other arguments in the argument list, are
+  // placed in registers xmm0 - xmm3.
+  //
+  // * Other arguments are pushed onto the stack in right-to-left order,
+  // such that the left-most argument ends up on the top of the stack at
+  // the lowest memory address.
+  //
+  // * Stack arguments of vector type are aligned to start at the next
+  // highest multiple of 16 bytes.  Other stack arguments are aligned to
+  // 4 bytes.
+  //
+  // This is intended to match the section "IA-32 Function Calling
+  // Convention" of the document "OS X ABI Function Call Guide" by
+  // Apple.
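+  //
+  // For example, for a call f(i32 a, v4f32 b, i32 c), b is passed in
+  // xmm0 while a and c are stored at [esp+0] and [esp+4], and the
+  // parameter area is then rounded up to a multiple of 16 bytes.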
+  NeedsStackAlignment = true;
+
+  typedef std::vector<Operand *> OperandList;
+  OperandList XmmArgs;
+  OperandList StackArgs, StackArgLocations;
+  uint32_t ParameterAreaSizeBytes = 0;
+
+  // Classify each argument operand according to the location where the
+  // argument is passed.
+  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    Type Ty = Arg->getType();
+    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
+    assert(typeWidthInBytes(Ty) >= 4);
+    if (isVectorType(Ty) && XmmArgs.size() < Traits::X86_MAX_XMM_ARGS) {
+      XmmArgs.push_back(Arg);
+    } else {
+      StackArgs.push_back(Arg);
+      if (isVectorType(Arg->getType())) {
+        ParameterAreaSizeBytes =
+            Traits::applyStackAlignment(ParameterAreaSizeBytes);
+      }
+      Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
+      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
+      StackArgLocations.push_back(OperandX8632Mem::create(Func, Ty, esp, Loc));
+      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
+    }
+  }
+
+  // Adjust the parameter area so that the stack is aligned.  It is
+  // assumed that the stack is already aligned at the start of the
+  // calling sequence.
+  ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
+
+  // Subtract the appropriate amount for the argument area.  This also
+  // takes care of setting the stack adjustment during emission.
+  //
+  // TODO: If for some reason the call instruction gets dead-code
+  // eliminated after lowering, we would need to ensure that the
+  // pre-call and the post-call esp adjustment get eliminated as well.
+  if (ParameterAreaSizeBytes) {
+    _adjust_stack(ParameterAreaSizeBytes);
+  }
+
+  // Copy arguments that are passed on the stack to the appropriate
+  // stack locations.
+  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
+    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
+  }
+
+  // Copy arguments to be passed in registers to the appropriate
+  // registers.
+  // TODO: Investigate the impact of lowering arguments passed in
+  // registers after lowering stack arguments as opposed to the other
+  // way around.  Lowering register arguments after stack arguments may
+  // reduce register pressure.  On the other hand, lowering register
+  // arguments first (before stack arguments) may result in more compact
+  // code, as the memory operand displacements may end up being smaller
+  // before any stack adjustment is done.
+  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
+    Variable *Reg = legalizeToVar(XmmArgs[i], RegX8632::Reg_xmm0 + i);
+    // Generate a FakeUse of register arguments so that they do not get
+    // dead code eliminated as a result of the FakeKill of scratch
+    // registers after the call.
+    Context.insert(InstFakeUse::create(Func, Reg));
+  }
+  // Generate the call instruction.  Assign its result to a temporary
+  // with high register allocation weight.
+  Variable *Dest = Instr->getDest();
+  // ReturnReg doubles as ReturnRegLo as necessary.
+  Variable *ReturnReg = nullptr;
+  Variable *ReturnRegHi = nullptr;
+  if (Dest) {
+    switch (Dest->getType()) {
+    case IceType_NUM:
+      llvm_unreachable("Invalid Call dest type");
+      break;
+    case IceType_void:
+      break;
+    case IceType_i1:
+    case IceType_i8:
+    case IceType_i16:
+    case IceType_i32:
+      ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_eax);
+      break;
+    case IceType_i64:
+      ReturnReg = makeReg(IceType_i32, RegX8632::Reg_eax);
+      ReturnRegHi = makeReg(IceType_i32, RegX8632::Reg_edx);
+      break;
+    case IceType_f32:
+    case IceType_f64:
+      // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
+      // the fstp instruction.
+      break;
+    case IceType_v4i1:
+    case IceType_v8i1:
+    case IceType_v16i1:
+    case IceType_v16i8:
+    case IceType_v8i16:
+    case IceType_v4i32:
+    case IceType_v4f32:
+      ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_xmm0);
+      break;
+    }
+  }
+  Operand *CallTarget = legalize(Instr->getCallTarget());
+  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
+  if (NeedSandboxing) {
+    if (llvm::isa<Constant>(CallTarget)) {
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+    } else {
+      Variable *CallTargetVar = nullptr;
+      _mov(CallTargetVar, CallTarget);
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+      const SizeT BundleSize =
+          1 << Func->template getAssembler<>()->getBundleAlignLog2Bytes();
+      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
+      CallTarget = CallTargetVar;
+    }
+  }
+  Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
+  Context.insert(NewCall);
+  if (NeedSandboxing)
+    _bundle_unlock();
+  if (ReturnRegHi)
+    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
+
+  // Add the appropriate offset to esp.  The call instruction takes care
+  // of resetting the stack offset during emission.
+  if (ParameterAreaSizeBytes) {
+    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
+    _add(esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
+  }
+
+  // Insert a register-kill pseudo instruction.
+  Context.insert(InstFakeKill::create(Func, NewCall));
+
+  // Generate a FakeUse to keep the call live if necessary.
+  if (Instr->hasSideEffects() && ReturnReg) {
+    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
+    Context.insert(FakeUse);
+  }
+
+  if (!Dest)
+    return;
+
+  // Assign the result of the call to Dest.
+  if (ReturnReg) {
+    if (ReturnRegHi) {
+      assert(Dest->getType() == IceType_i64);
+      split64(Dest);
+      Variable *DestLo = Dest->getLo();
+      Variable *DestHi = Dest->getHi();
+      _mov(DestLo, ReturnReg);
+      _mov(DestHi, ReturnRegHi);
+    } else {
+      assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
+             Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
+             isVectorType(Dest->getType()));
+      if (isVectorType(Dest->getType())) {
+        _movp(Dest, ReturnReg);
+      } else {
+        _mov(Dest, ReturnReg);
+      }
+    }
+  } else if (isScalarFloatingType(Dest->getType())) {
+    // Special treatment for an FP function which returns its result in
+    // st(0).
+    // If Dest ends up being a physical xmm register, the fstp emit code
+    // will route st(0) through a temporary stack slot.
+    _fstp(Dest);
+    // Create a fake use of Dest in case it actually isn't used,
+    // because st(0) still needs to be popped.
+    Context.insert(InstFakeUse::create(Func, Dest));
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
+  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
+  InstCast::OpKind CastKind = Inst->getCastKind();
+  Variable *Dest = Inst->getDest();
+  switch (CastKind) {
+  default:
+    Func->setError("Cast type not supported");
+    return;
+  case InstCast::Sext: {
+    // Src0RM is the source operand legalized to physical register or memory,
+    // but not immediate, since the relevant x86 native instructions don't
+    // allow an immediate operand.  If the operand is an immediate, we could
+    // consider computing the strength-reduced result at translation time,
+    // but we're unlikely to see something like that in the bitcode that
+    // the optimizer wouldn't have already taken care of.
+    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+    if (isVectorType(Dest->getType())) {
+      Type DestTy = Dest->getType();
+      if (DestTy == IceType_v16i8) {
+        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
+        Variable *OneMask = makeVectorOfOnes(Dest->getType());
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _pand(T, OneMask);
+        Variable *Zeros = makeVectorOfZeros(Dest->getType());
+        _pcmpgt(T, Zeros);
+        _movp(Dest, T);
+      } else {
+        // width = width(elty) - 1; dest = (src << width) >> width
+        SizeT ShiftAmount =
+            Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
+            1;
+        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
+        Variable *T = makeReg(DestTy);
+        _movp(T, Src0RM);
+        _psll(T, ShiftConstant);
+        _psra(T, ShiftConstant);
+        _movp(Dest, T);
+      }
+    } else if (Dest->getType() == IceType_i64) {
+      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
+      Constant *Shift = Ctx->getConstantInt32(31);
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Variable *T_Lo = makeReg(DestLo->getType());
+      if (Src0RM->getType() == IceType_i32) {
+        _mov(T_Lo, Src0RM);
+      } else if (Src0RM->getType() == IceType_i1) {
+        _movzx(T_Lo, Src0RM);
+        _shl(T_Lo, Shift);
+        _sar(T_Lo, Shift);
+      } else {
+        _movsx(T_Lo, Src0RM);
+      }
+      _mov(DestLo, T_Lo);
+      Variable *T_Hi = nullptr;
+      _mov(T_Hi, T_Lo);
+      if (Src0RM->getType() != IceType_i1)
+        // For i1, the sar instruction is already done above.
+        _sar(T_Hi, Shift);
+      _mov(DestHi, T_Hi);
+    } else if (Src0RM->getType() == IceType_i1) {
+      // t1 = src
+      // shl t1, dst_bitwidth - 1
+      // sar t1, dst_bitwidth - 1
+      // dst = t1
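+      // E.g. for an i1 -> i32 sext of 1: 1 << 31 = 0x80000000, then
+      // sar 31 yields 0xffffffff, i.e. true sign-extends to -1.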
+      size_t DestBits =
+          Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType());
+      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
+      Variable *T = makeReg(Dest->getType());
+      if (typeWidthInBytes(Dest->getType()) <=
+          typeWidthInBytes(Src0RM->getType())) {
+        _mov(T, Src0RM);
+      } else {
+        // Widen the source using movsx or movzx.  (It doesn't matter
+        // which one, since the following shl/sar overwrite the bits.)
+        _movzx(T, Src0RM);
+      }
+      _shl(T, ShiftAmount);
+      _sar(T, ShiftAmount);
+      _mov(Dest, T);
+    } else {
+      // t1 = movsx src; dst = t1
+      Variable *T = makeReg(Dest->getType());
+      _movsx(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Zext: {
+    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+    if (isVectorType(Dest->getType())) {
+      // onemask = materialize(1,1,...); dest = onemask & src
+      Type DestTy = Dest->getType();
+      Variable *OneMask = makeVectorOfOnes(DestTy);
+      Variable *T = makeReg(DestTy);
+      _movp(T, Src0RM);
+      _pand(T, OneMask);
+      _movp(Dest, T);
+    } else if (Dest->getType() == IceType_i64) {
+      // t1=movzx src; dst.lo=t1; dst.hi=0
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Variable *Tmp = makeReg(DestLo->getType());
+      if (Src0RM->getType() == IceType_i32) {
+        _mov(Tmp, Src0RM);
+      } else {
+        _movzx(Tmp, Src0RM);
+      }
+      if (Src0RM->getType() == IceType_i1) {
+        Constant *One = Ctx->getConstantInt32(1);
+        _and(Tmp, One);
+      }
+      _mov(DestLo, Tmp);
+      _mov(DestHi, Zero);
+    } else if (Src0RM->getType() == IceType_i1) {
+      // t = Src0RM; t &= 1; Dest = t
+      Constant *One = Ctx->getConstantInt32(1);
+      Type DestTy = Dest->getType();
+      Variable *T;
+      if (DestTy == IceType_i8) {
+        T = makeReg(DestTy);
+        _mov(T, Src0RM);
+      } else {
+        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
+        T = makeReg(IceType_i32);
+        _movzx(T, Src0RM);
+      }
+      _and(T, One);
+      _mov(Dest, T);
+    } else {
+      // t1 = movzx src; dst = t1
+      Variable *T = makeReg(Dest->getType());
+      _movzx(T, Src0RM);
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Trunc: {
+    if (isVectorType(Dest->getType())) {
+      // onemask = materialize(1,1,...); dst = src & onemask
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      Type Src0Ty = Src0RM->getType();
+      Variable *OneMask = makeVectorOfOnes(Src0Ty);
+      Variable *T = makeReg(Dest->getType());
+      _movp(T, Src0RM);
+      _pand(T, OneMask);
+      _movp(Dest, T);
+    } else {
+      Operand *Src0 = Inst->getSrc(0);
+      if (Src0->getType() == IceType_i64)
+        Src0 = loOperand(Src0);
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // t1 = trunc Src0RM; Dest = t1
+      Variable *T = nullptr;
+      _mov(T, Src0RM);
+      if (Dest->getType() == IceType_i1)
+        _and(T, Ctx->getConstantInt1(1));
+      _mov(Dest, T);
+    }
+    break;
+  }
+  case InstCast::Fptrunc:
+  case InstCast::Fpext: {
+    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+    // t1 = cvt Src0RM; Dest = t1
+    Variable *T = makeReg(Dest->getType());
+    _cvt(T, Src0RM, InstX8632Cvt::Float2float);
+    _mov(Dest, T);
+    break;
+  }
+  case InstCast::Fptosi:
+    if (isVectorType(Dest->getType())) {
+      assert(Dest->getType() == IceType_v4i32 &&
+             Inst->getSrc(0)->getType() == IceType_v4f32);
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
+      Variable *T = makeReg(Dest->getType());
+      _cvt(T, Src0RM, InstX8632Cvt::Tps2dq);
+      _movp(Dest, T);
+    } else if (Dest->getType() == IceType_i64) {
+      // Use a helper for converting floating-point values to 64-bit
+      // integers.  SSE2 appears to have no way to convert from xmm
+      // registers to something like the edx:eax register pair, and
+      // gcc and clang both want to use x87 instructions complete with
+      // temporary manipulation of the status word.  This helper is
+      // not needed for x86-64.
+      split64(Dest);
+      const SizeT MaxSrcs = 1;
+      Type SrcType = Inst->getSrc(0)->getType();
+      InstCall *Call =
+          makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64
+                                                           : H_fptosi_f64_i64,
+                         Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      lowerCall(Call);
+    } else {
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_2 = makeReg(Dest->getType());
+      _cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (Dest->getType() == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Fptoui:
+    if (isVectorType(Dest->getType())) {
+      assert(Dest->getType() == IceType_v4i32 &&
+             Inst->getSrc(0)->getType() == IceType_v4f32);
+      const SizeT MaxSrcs = 1;
+      InstCall *Call = makeHelperCall(H_fptoui_4xi32_f32, Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      lowerCall(Call);
+    } else if (Dest->getType() == IceType_i64 ||
+               Dest->getType() == IceType_i32) {
+      // Use a helper for both x86-32 and x86-64.
+      split64(Dest);
+      const SizeT MaxSrcs = 1;
+      Type DestType = Dest->getType();
+      Type SrcType = Inst->getSrc(0)->getType();
+      IceString TargetString;
+      if (isInt32Asserting32Or64(DestType)) {
+        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
+                                                         : H_fptoui_f64_i32;
+      } else {
+        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
+                                                         : H_fptoui_f64_i64;
+      }
+      InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
+      Call->addArg(Inst->getSrc(0));
+      lowerCall(Call);
+      return;
+    } else {
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
+      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_2 = makeReg(Dest->getType());
+      _cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
+      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
+      if (Dest->getType() == IceType_i1)
+        _and(T_2, Ctx->getConstantInt1(1));
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Sitofp:
+    if (isVectorType(Dest->getType())) {
+      assert(Dest->getType() == IceType_v4f32 &&
+             Inst->getSrc(0)->getType() == IceType_v4i32);
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
+      Variable *T = makeReg(Dest->getType());
+      _cvt(T, Src0RM, InstX8632Cvt::Dq2ps);
+      _movp(Dest, T);
+    } else if (Inst->getSrc(0)->getType() == IceType_i64) {
+      // Use a helper for x86-32.
+      const SizeT MaxSrcs = 1;
+      Type DestType = Dest->getType();
+      InstCall *Call =
+          makeHelperCall(isFloat32Asserting32Or64(DestType) ? H_sitofp_i64_f32
+                                                            : H_sitofp_i64_f64,
+                         Dest, MaxSrcs);
+      // TODO: Call the correct compiler-rt helper function.
+      Call->addArg(Inst->getSrc(0));
+      lowerCall(Call);
+      return;
+    } else {
+      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      // Sign-extend the operand.
+      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
+      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_2 = makeReg(Dest->getType());
+      if (Src0RM->getType() == IceType_i32)
+        _mov(T_1, Src0RM);
+      else
+        _movsx(T_1, Src0RM);
+      _cvt(T_2, T_1, InstX8632Cvt::Si2ss);
+      _mov(Dest, T_2);
+    }
+    break;
+  case InstCast::Uitofp: {
+    Operand *Src0 = Inst->getSrc(0);
+    if (isVectorType(Src0->getType())) {
+      assert(Dest->getType() == IceType_v4f32 &&
+             Src0->getType() == IceType_v4i32);
+      const SizeT MaxSrcs = 1;
+      InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs);
+      Call->addArg(Src0);
+      lowerCall(Call);
+    } else if (Src0->getType() == IceType_i64 ||
+               Src0->getType() == IceType_i32) {
+      // Use a helper for i64 on both x86-32 and x86-64.  Also use a
+      // helper for i32 on x86-32.
+      const SizeT MaxSrcs = 1;
+      Type DestType = Dest->getType();
+      IceString TargetString;
+      if (isInt32Asserting32Or64(Src0->getType())) {
+        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32
+                                                          : H_uitofp_i32_f64;
+      } else {
+        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32
+                                                          : H_uitofp_i64_f64;
+      }
+      InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
+      Call->addArg(Src0);
+      lowerCall(Call);
+      return;
+    } else {
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      // Zero-extend the operand.
+      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
+      Variable *T_1 = makeReg(IceType_i32);
+      Variable *T_2 = makeReg(Dest->getType());
+      if (Src0RM->getType() == IceType_i32)
+        _mov(T_1, Src0RM);
+      else
+        _movzx(T_1, Src0RM);
+      _cvt(T_2, T_1, InstX8632Cvt::Si2ss);
+      _mov(Dest, T_2);
+    }
+    break;
+  }
+  case InstCast::Bitcast: {
+    Operand *Src0 = Inst->getSrc(0);
+    if (Dest->getType() == Src0->getType()) {
+      InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
+      lowerAssign(Assign);
+      return;
+    }
+    switch (Dest->getType()) {
+    default:
+      llvm_unreachable("Unexpected Bitcast dest type");
+    case IceType_i8: {
+      assert(Src0->getType() == IceType_v8i1);
+      InstCall *Call = makeHelperCall(H_bitcast_8xi1_i8, Dest, 1);
+      Call->addArg(Src0);
+      lowerCall(Call);
+    } break;
+    case IceType_i16: {
+      assert(Src0->getType() == IceType_v16i1);
+      InstCall *Call = makeHelperCall(H_bitcast_16xi1_i16, Dest, 1);
+      Call->addArg(Src0);
+      lowerCall(Call);
+    } break;
+    case IceType_i32:
+    case IceType_f32: {
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      Type DestType = Dest->getType();
+      Type SrcType = Src0RM->getType();
+      (void)DestType;
+      assert((DestType == IceType_i32 && SrcType == IceType_f32) ||
+             (DestType == IceType_f32 && SrcType == IceType_i32));
+      // a.i32 = bitcast b.f32 ==>
+      //   t.f32 = b.f32
+      //   s.f32 = spill t.f32
+      //   a.i32 = s.f32
+      Variable *T = nullptr;
+      // TODO: Should be able to force a spill setup by calling legalize() with
+      // Legal_Mem and not Legal_Reg or Legal_Imm.
+      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(SrcType);
+      SpillVar->setLinkedTo(Dest);
+      Variable *Spill = SpillVar;
+      Spill->setWeight(RegWeight::Zero);
+      _mov(T, Src0RM);
+      _mov(Spill, T);
+      _mov(Dest, Spill);
+    } break;
+    case IceType_i64: {
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      assert(Src0RM->getType() == IceType_f64);
+      // a.i64 = bitcast b.f64 ==>
+      //   s.f64 = spill b.f64
+      //   t_lo.i32 = lo(s.f64)
+      //   a_lo.i32 = t_lo.i32
+      //   t_hi.i32 = hi(s.f64)
+      //   a_hi.i32 = t_hi.i32
+      Operand *SpillLo, *SpillHi;
+      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+        SpillVariable *SpillVar =
+            Func->makeVariable<SpillVariable>(IceType_f64);
+        SpillVar->setLinkedTo(Src0Var);
+        Variable *Spill = SpillVar;
+        Spill->setWeight(RegWeight::Zero);
+        _movq(Spill, Src0RM);
+        SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
+        SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
+      } else {
+        SpillLo = loOperand(Src0RM);
+        SpillHi = hiOperand(Src0RM);
+      }
+
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Variable *T_Lo = makeReg(IceType_i32);
+      Variable *T_Hi = makeReg(IceType_i32);
+
+      _mov(T_Lo, SpillLo);
+      _mov(DestLo, T_Lo);
+      _mov(T_Hi, SpillHi);
+      _mov(DestHi, T_Hi);
+    } break;
+    case IceType_f64: {
+      Src0 = legalize(Src0);
+      assert(Src0->getType() == IceType_i64);
+      if (llvm::isa<OperandX8632Mem>(Src0)) {
+        Variable *T = Func->makeVariable(Dest->getType());
+        _movq(T, Src0);
+        _movq(Dest, T);
+        break;
+      }
+      // a.f64 = bitcast b.i64 ==>
+      //   t_lo.i32 = b_lo.i32
+      //   FakeDef(s.f64)
+      //   lo(s.f64) = t_lo.i32
+      //   t_hi.i32 = b_hi.i32
+      //   hi(s.f64) = t_hi.i32
+      //   a.f64 = s.f64
+      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
+      SpillVar->setLinkedTo(Dest);
+      Variable *Spill = SpillVar;
+      Spill->setWeight(RegWeight::Zero);
+
+      Variable *T_Lo = nullptr, *T_Hi = nullptr;
+      VariableSplit *SpillLo =
+          VariableSplit::create(Func, Spill, VariableSplit::Low);
+      VariableSplit *SpillHi =
+          VariableSplit::create(Func, Spill, VariableSplit::High);
+      _mov(T_Lo, loOperand(Src0));
+      // Technically, the Spill is defined after the _store happens, but
+      // SpillLo is considered a "use" of Spill so define Spill before it
+      // is used.
+      Context.insert(InstFakeDef::create(Func, Spill));
+      _store(T_Lo, SpillLo);
+      _mov(T_Hi, hiOperand(Src0));
+      _store(T_Hi, SpillHi);
+      _movq(Dest, Spill);
+    } break;
+    case IceType_v8i1: {
+      assert(Src0->getType() == IceType_i8);
+      InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1);
+      Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
+      // Arguments to functions are required to be at least 32 bits wide.
+      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
+      Call->addArg(Src0AsI32);
+      lowerCall(Call);
+    } break;
+    case IceType_v16i1: {
+      assert(Src0->getType() == IceType_i16);
+      InstCall *Call = makeHelperCall(H_bitcast_i16_16xi1, Dest, 1);
+      Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
+      // Arguments to functions are required to be at least 32 bits wide.
+      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
+      Call->addArg(Src0AsI32);
+      lowerCall(Call);
+    } break;
+    case IceType_v8i16:
+    case IceType_v16i8:
+    case IceType_v4i32:
+    case IceType_v4f32: {
+      _movp(Dest, legalizeToVar(Src0));
+    } break;
+    }
+    break;
+  }
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerExtractElement(
+    const InstExtractElement *Inst) {
+  Operand *SourceVectNotLegalized = Inst->getSrc(0);
+  ConstantInteger32 *ElementIndex =
+      llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(1));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+
+  unsigned Index = ElementIndex->getValue();
+  Type Ty = SourceVectNotLegalized->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
+  Variable *ExtractedElementR = makeReg(InVectorElementTy);
+
+  // TODO(wala): Determine the best lowering sequences for each type.
+  bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
+                     InstructionSet >= Machine::SSE4_1;
+  if (CanUsePextr && Ty != IceType_v4f32) {
+    // Use pextrb, pextrw, or pextrd.
+    Constant *Mask = Ctx->getConstantInt32(Index);
+    Variable *SourceVectR = legalizeToVar(SourceVectNotLegalized);
+    _pextr(ExtractedElementR, SourceVectR, Mask);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use pshufd and movd/movss.
+    Variable *T = nullptr;
+    if (Index) {
+      // The shuffle only needs to occur if the element to be extracted
+      // is not at the lowest index.
+      Constant *Mask = Ctx->getConstantInt32(Index);
+      T = makeReg(Ty);
+      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
+    } else {
+      T = legalizeToVar(SourceVectNotLegalized);
+    }
+
+    if (InVectorElementTy == IceType_i32) {
+      _movd(ExtractedElementR, T);
+    } else { // InVectorElementTy == IceType_f32
+      // TODO(wala): _movss is only used here because _mov does not
+      // allow a vector source and a scalar destination.  _mov should be
+      // able to be used here.
+      // _movss is a binary instruction, so the FakeDef is needed to
+      // keep the live range analysis consistent.
+      Context.insert(InstFakeDef::create(Func, ExtractedElementR));
+      _movss(ExtractedElementR, T);
+    }
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and do the extraction in memory.
+    //
+    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
+    // support for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty);
+    Slot->setWeight(RegWeight::Zero);
+    _movp(Slot, legalizeToVar(SourceVectNotLegalized));
+
+    // Compute the location of the element in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    OperandX8632Mem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _mov(ExtractedElementR, Loc);
+  }
+
+  if (ElementTy == IceType_i1) {
+    // Truncate extracted integers to i1s if necessary.
+    Variable *T = makeReg(IceType_i1);
+    InstCast *Cast =
+        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
+    lowerCast(Cast);
+    ExtractedElementR = T;
+  }
+
+  // Copy the element to the destination.
+  Variable *Dest = Inst->getDest();
+  _mov(Dest, ExtractedElementR);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerFcmp(const InstFcmp *Inst) {
+  Operand *Src0 = Inst->getSrc(0);
+  Operand *Src1 = Inst->getSrc(1);
+  Variable *Dest = Inst->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    InstFcmp::FCond Condition = Inst->getCondition();
+    size_t Index = static_cast<size_t>(Condition);
+    assert(Index < Traits::TableFcmpSize);
+
+    if (Traits::TableFcmp[Index].SwapVectorOperands) {
+      Operand *T = Src0;
+      Src0 = Src1;
+      Src1 = T;
+    }
+
+    Variable *T = nullptr;
+
+    if (Condition == InstFcmp::True) {
+      // makeVectorOfOnes() requires an integer vector type.
+      T = makeVectorOfMinusOnes(IceType_v4i32);
+    } else if (Condition == InstFcmp::False) {
+      T = makeVectorOfZeros(Dest->getType());
+    } else {
+      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+      Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
+
+      switch (Condition) {
+      default: {
+        CondX86::CmppsCond Predicate = Traits::TableFcmp[Index].Predicate;
+        assert(Predicate != CondX86::Cmpps_Invalid);
+        T = makeReg(Src0RM->getType());
+        _movp(T, Src0RM);
+        _cmpps(T, Src1RM, Predicate);
+      } break;
+      case InstFcmp::One: {
+        // Check both unequal and ordered.
+        T = makeReg(Src0RM->getType());
+        Variable *T2 = makeReg(Src0RM->getType());
+        _movp(T, Src0RM);
+        _cmpps(T, Src1RM, CondX86::Cmpps_neq);
+        _movp(T2, Src0RM);
+        _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
+        _pand(T, T2);
+      } break;
+      case InstFcmp::Ueq: {
+        // Check both equal or unordered.
+        T = makeReg(Src0RM->getType());
+        Variable *T2 = makeReg(Src0RM->getType());
+        _movp(T, Src0RM);
+        _cmpps(T, Src1RM, CondX86::Cmpps_eq);
+        _movp(T2, Src0RM);
+        _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
+        _por(T, T2);
+      } break;
+      }
+    }
+
+    _movp(Dest, T);
+    eliminateNextVectorSextInstruction(Dest);
+    return;
+  }
+
+  // Lowering a = fcmp cond, b, c
+  //   ucomiss b, c       /* only if C1 != Br_None */
+  //                      /* but swap b,c order if SwapOperands==true */
+  //   mov a, <default>
+  //   j<C1> label        /* only if C1 != Br_None */
+  //   j<C2> label        /* only if C2 != Br_None */
+  //   FakeUse(a)         /* only if C1 != Br_None */
+  //   mov a, !<default>  /* only if C1 != Br_None */
+  //   label:             /* only if C1 != Br_None */
+  //
+  // setcc lowering when C1 != Br_None && C2 == Br_None:
+  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
+  //   setcc a, C1
+  InstFcmp::FCond Condition = Inst->getCondition();
+  size_t Index = static_cast<size_t>(Condition);
+  assert(Index < Traits::TableFcmpSize);
+  if (Traits::TableFcmp[Index].SwapScalarOperands)
+    std::swap(Src0, Src1);
+  bool HasC1 = (Traits::TableFcmp[Index].C1 != CondX86::Br_None);
+  bool HasC2 = (Traits::TableFcmp[Index].C2 != CondX86::Br_None);
+  if (HasC1) {
+    Src0 = legalize(Src0);
+    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+    Variable *T = nullptr;
+    _mov(T, Src0);
+    _ucomiss(T, Src1RM);
+    if (!HasC2) {
+      assert(Traits::TableFcmp[Index].Default);
+      _setcc(Dest, Traits::TableFcmp[Index].C1);
+      return;
+    }
+  }
+  Constant *Default = Ctx->getConstantInt32(Traits::TableFcmp[Index].Default);
+  _mov(Dest, Default);
+  if (HasC1) {
+    InstX8632Label *Label = InstX8632Label::create(Func, this);
+    _br(Traits::TableFcmp[Index].C1, Label);
+    if (HasC2) {
+      _br(Traits::TableFcmp[Index].C2, Label);
+    }
+    Constant *NonDefault =
+        Ctx->getConstantInt32(!Traits::TableFcmp[Index].Default);
+    _mov_nonkillable(Dest, NonDefault);
+    Context.insert(Label);
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerIcmp(const InstIcmp *Inst) {
+  Operand *Src0 = legalize(Inst->getSrc(0));
+  Operand *Src1 = legalize(Inst->getSrc(1));
+  Variable *Dest = Inst->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    Type Ty = Src0->getType();
+    // Promote i1 vectors to 128-bit integer vector types.
+    if (typeElementType(Ty) == IceType_i1) {
+      Type NewTy = IceType_NUM;
+      switch (Ty) {
+      default:
+        llvm_unreachable("unexpected type");
+        break;
+      case IceType_v4i1:
+        NewTy = IceType_v4i32;
+        break;
+      case IceType_v8i1:
+        NewTy = IceType_v8i16;
+        break;
+      case IceType_v16i1:
+        NewTy = IceType_v16i8;
+        break;
+      }
+      Variable *NewSrc0 = Func->makeVariable(NewTy);
+      Variable *NewSrc1 = Func->makeVariable(NewTy);
+      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
+      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
+      Src0 = NewSrc0;
+      Src1 = NewSrc1;
+      Ty = NewTy;
+    }
+
+    InstIcmp::ICond Condition = Inst->getCondition();
+
+    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
+    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+
+    // SSE2 only has signed comparison operations.  Transform unsigned
+    // inputs in a manner that allows for the use of signed comparison
+    // operations by flipping the high order bits.
+    if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
+        Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
+      Variable *T0 = makeReg(Ty);
+      Variable *T1 = makeReg(Ty);
+      Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
+      _movp(T0, Src0RM);
+      _pxor(T0, HighOrderBits);
+      _movp(T1, Src1RM);
+      _pxor(T1, HighOrderBits);
+      Src0RM = T0;
+      Src1RM = T1;
+    }
+
+    Variable *T = makeReg(Ty);
+    switch (Condition) {
+    default:
+      llvm_unreachable("unexpected condition");
+      break;
+    case InstIcmp::Eq: {
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
+      _movp(T, Src0RM);
+      _pcmpeq(T, Src1RM);
+    } break;
+    case InstIcmp::Ne: {
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
+      _movp(T, Src0RM);
+      _pcmpeq(T, Src1RM);
+      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+      _pxor(T, MinusOne);
+    } break;
+    case InstIcmp::Ugt:
+    case InstIcmp::Sgt: {
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
+      _movp(T, Src0RM);
+      _pcmpgt(T, Src1RM);
+    } break;
+    case InstIcmp::Uge:
+    case InstIcmp::Sge: {
+      // !(Src1RM > Src0RM)
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
+      _movp(T, Src1RM);
+      _pcmpgt(T, Src0RM);
+      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+      _pxor(T, MinusOne);
+    } break;
+    case InstIcmp::Ult:
+    case InstIcmp::Slt: {
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
+      _movp(T, Src1RM);
+      _pcmpgt(T, Src0RM);
+    } break;
+    case InstIcmp::Ule:
+    case InstIcmp::Sle: {
+      // !(Src0RM > Src1RM)
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
+      _movp(T, Src0RM);
+      _pcmpgt(T, Src1RM);
+      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+      _pxor(T, MinusOne);
+    } break;
+    }
+
+    _movp(Dest, T);
+    eliminateNextVectorSextInstruction(Dest);
+    return;
+  }
+
+  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
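+  //
+  // For i64 operands on x86-32, the comparison is decided by the high
+  // words when they differ (the C1/C2 branches below), and falls
+  // through to a compare of the low words (C3) only when the high
+  // words are equal.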
+  if (Src0->getType() == IceType_i64) {
+    InstIcmp::ICond Condition = Inst->getCondition();
+    size_t Index = static_cast<size_t>(Condition);
+    assert(Index < Traits::TableIcmp64Size);
+    Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
+    Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
+    Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
+    Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
+    Constant *Zero = Ctx->getConstantZero(IceType_i32);
+    Constant *One = Ctx->getConstantInt32(1);
+    InstX8632Label *LabelFalse = InstX8632Label::create(Func, this);
+    InstX8632Label *LabelTrue = InstX8632Label::create(Func, this);
+    _mov(Dest, One);
+    _cmp(Src0HiRM, Src1HiRI);
+    if (Traits::TableIcmp64[Index].C1 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Index].C1, LabelTrue);
+    if (Traits::TableIcmp64[Index].C2 != CondX86::Br_None)
+      _br(Traits::TableIcmp64[Index].C2, LabelFalse);
+    _cmp(Src0LoRM, Src1LoRI);
+    _br(Traits::TableIcmp64[Index].C3, LabelTrue);
+    Context.insert(LabelFalse);
+    _mov_nonkillable(Dest, Zero);
+    Context.insert(LabelTrue);
+    return;
+  }
+
+  // cmp b, c
+  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
+  _cmp(Src0RM, Src1);
+  _setcc(Dest, Traits::getIcmp32Mapping(Inst->getCondition()));
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerInsertElement(const InstInsertElement *Inst) {
+  Operand *SourceVectNotLegalized = Inst->getSrc(0);
+  Operand *ElementToInsertNotLegalized = Inst->getSrc(1);
+  ConstantInteger32 *ElementIndex =
+      llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(2));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+  unsigned Index = ElementIndex->getValue();
+  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
+
+  Type Ty = SourceVectNotLegalized->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = Traits::getInVectorElementType(Ty);
+
+  if (ElementTy == IceType_i1) {
+    // Expand the element to the appropriate size for it to be inserted
+    // in the vector.
+    Variable *Expanded = Func->makeVariable(InVectorElementTy);
+    InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
+                                      ElementToInsertNotLegalized);
+    lowerCast(Cast);
+    ElementToInsertNotLegalized = Expanded;
+  }
+
+  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
+      InstructionSet >= Machine::SSE4_1) {
+    // Use insertps, pinsrb, pinsrw, or pinsrd.
+    Operand *ElementRM =
+        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
+    Operand *SourceVectRM =
+        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
+    Variable *T = makeReg(Ty);
+    _movp(T, SourceVectRM);
+    if (Ty == IceType_v4f32)
+      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
+    else
+      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
+    _movp(Inst->getDest(), T);
+  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Use shufps or movss.
+    Variable *ElementR = nullptr;
+    Operand *SourceVectRM =
+        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
+
+    if (InVectorElementTy == IceType_f32) {
+      // ElementR will be in an XMM register since it is floating point.
+      ElementR = legalizeToVar(ElementToInsertNotLegalized);
+    } else {
+      // Copy an integer to an XMM register.
+      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
+      ElementR = makeReg(Ty);
+      _movd(ElementR, T);
+    }
+
+    if (Index == 0) {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectRM);
+      _movss(T, ElementR);
+      _movp(Inst->getDest(), T);
+      return;
+    }
+
+    // shufps treats the source and destination operands as vectors of
+    // four doublewords.  The destination's two high doublewords are
+    // selected from the source operand and the two low doublewords are
+    // selected from the (original value of) the destination operand.
+    // An insertelement operation can be effected with a sequence of two
+    // shufps operations with appropriate masks.  In all cases below,
+    // Element[0] is being inserted into SourceVectOperand.  Indices are
+    // ordered from left to right.
+    //
+    // insertelement into index 1 (result is stored in ElementR):
+    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
+    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
+    //
+    // insertelement into index 2 (result is stored in T):
+    //   T := SourceVectRM
+    //   ElementR := ElementR[0, 0] T[0, 3]
+    //   T := T[0, 1] ElementR[0, 3]
+    //
+    // insertelement into index 3 (result is stored in T):
+    //   T := SourceVectRM
+    //   ElementR := ElementR[0, 0] T[0, 2]
+    //   T := T[0, 1] ElementR[3, 0]
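+    //
+    // The mask bytes below pack those selections as four 2-bit fields in
+    // one shufps immediate; e.g. 192 == 0b11000000 and 227 == 0b11100011.
+    // Mask1[Index - 1] and Mask2[Index - 1] are the immediates for the
+    // first and second shufps when inserting at Index.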
+    const unsigned char Mask1[3] = {0, 192, 128};
+    const unsigned char Mask2[3] = {227, 196, 52};
+
+    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
+    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
+
+    if (Index == 1) {
+      _shufps(ElementR, SourceVectRM, Mask1Constant);
+      _shufps(ElementR, SourceVectRM, Mask2Constant);
+      _movp(Inst->getDest(), ElementR);
+    } else {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectRM);
+      _shufps(ElementR, T, Mask1Constant);
+      _shufps(T, ElementR, Mask2Constant);
+      _movp(Inst->getDest(), T);
+    }
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and perform the insertion in
+    // memory.
+    //
+    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
+    // support for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty);
+    Slot->setWeight(RegWeight::Zero);
+    _movp(Slot, legalizeToVar(SourceVectNotLegalized));
+
+    // Compute the location of the position to insert in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    OperandX8632Mem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _store(legalizeToVar(ElementToInsertNotLegalized), Loc);
+
+    Variable *T = makeReg(Ty);
+    _movp(T, Slot);
+    _movp(Inst->getDest(), T);
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerIntrinsicCall(
+    const InstIntrinsicCall *Instr) {
+  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
+  case Intrinsics::AtomicCmpxchg: {
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(3)),
+            getConstantMemoryOrder(Instr->getArg(4)))) {
+      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
+      return;
+    }
+    Variable *DestPrev = Instr->getDest();
+    Operand *PtrToMem = Instr->getArg(0);
+    Operand *Expected = Instr->getArg(1);
+    Operand *Desired = Instr->getArg(2);
+    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
+      return;
+    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
+    return;
+  }
+  case Intrinsics::AtomicFence:
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
+      Func->setError("Unexpected memory ordering for AtomicFence");
+      return;
+    }
+    _mfence();
+    return;
+  case Intrinsics::AtomicFenceAll:
+    // NOTE: FenceAll should prevent any load/store from being moved
+    // across the fence (both atomic and non-atomic). The InstX8632Mfence
+    // instruction is currently marked coarsely as "HasSideEffects".
+    _mfence();
+    return;
+  case Intrinsics::AtomicIsLockFree: {
+    // X86 is always lock free for 8/16/32/64 bit accesses.
+    // TODO(jvoung): Since the result is constant when given a constant
+    // byte size, this opens up DCE opportunities.
+    Operand *ByteSize = Instr->getArg(0);
+    Variable *Dest = Instr->getDest();
+    if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
+      Constant *Result;
+      switch (CI->getValue()) {
+      default:
+        // Some x86-64 processors support the cmpxchg16b instruction, which
+        // can make 16-byte operations lock free (when used with the LOCK
+        // prefix). However, that's not supported in 32-bit mode, so just
+        // return 0 even for large sizes.
+        Result = Ctx->getConstantZero(IceType_i32);
+        break;
+      case 1:
+      case 2:
+      case 4:
+      case 8:
+        Result = Ctx->getConstantInt32(1);
+        break;
+      }
+      _mov(Dest, Result);
+      return;
+    }
+    // The PNaCl ABI requires the byte size to be a compile-time constant.
+    Func->setError("AtomicIsLockFree byte size should be compile-time const");
+    return;
+  }
+  case Intrinsics::AtomicLoad: {
+    // We require the memory address to be naturally aligned.
+    // Given that alignment, normal loads are atomic.
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
+      Func->setError("Unexpected memory ordering for AtomicLoad");
+      return;
+    }
+    Variable *Dest = Instr->getDest();
+    if (Dest->getType() == IceType_i64) {
+      // Follow what GCC does and use a movq instead of what lowerLoad()
+      // normally does (split the load into two).
+      // Thus, this skips load/arithmetic op folding. Load/arithmetic folding
+      // can't happen anyway, since this is x86-32 and integer arithmetic only
+      // happens on 32-bit quantities.
+      Variable *T = makeReg(IceType_f64);
+      OperandX8632Mem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
+      _movq(T, Addr);
+      // Then cast the bits back out of the XMM register to the i64 Dest.
+      InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
+      lowerCast(Cast);
+      // Make sure that the atomic load isn't elided when unused.
+      Context.insert(InstFakeUse::create(Func, Dest->getLo()));
+      Context.insert(InstFakeUse::create(Func, Dest->getHi()));
+      return;
+    }
+    InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
+    lowerLoad(Load);
+    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
+    // Since lowerLoad may fuse the load w/ an arithmetic instruction,
+    // insert the FakeUse on the last-inserted instruction's dest.
+    Context.insert(
+        InstFakeUse::create(Func, Context.getLastInserted()->getDest()));
+    return;
+  }
+  case Intrinsics::AtomicRMW:
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
+      Func->setError("Unexpected memory ordering for AtomicRMW");
+      return;
+    }
+    lowerAtomicRMW(Instr->getDest(),
+                   static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
+                                             Instr->getArg(0))->getValue()),
+                   Instr->getArg(1), Instr->getArg(2));
+    return;
+  case Intrinsics::AtomicStore: {
+    if (!Intrinsics::isMemoryOrderValid(
+            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
+      Func->setError("Unexpected memory ordering for AtomicStore");
+      return;
+    }
+    // We require the memory address to be naturally aligned.
+    // Given that alignment, normal stores are atomic.
+    // Add a fence after the store to make it visible.
+    Operand *Value = Instr->getArg(0);
+    Operand *Ptr = Instr->getArg(1);
+    if (Value->getType() == IceType_i64) {
+      // Use a movq instead of what lowerStore() normally does
+      // (split the store into two), following what GCC does.
+      // Cast the bits from int to an xmm register first.
+      Variable *T = makeReg(IceType_f64);
+      InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
+      lowerCast(Cast);
+      // Then store XMM w/ a movq.
+      OperandX8632Mem *Addr = formMemoryOperand(Ptr, IceType_f64);
+      _storeq(T, Addr);
+      _mfence();
+      return;
+    }
+    InstStore *Store = InstStore::create(Func, Value, Ptr);
+    lowerStore(Store);
+    _mfence();
+    return;
+  }
+  case Intrinsics::Bswap: {
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    // In 32-bit mode, bswap only works on 32-bit arguments, and the
+    // argument must be a register. Use rotate left for 16-bit bswap.
+    if (Val->getType() == IceType_i64) {
+      Variable *T_Lo = legalizeToVar(loOperand(Val));
+      Variable *T_Hi = legalizeToVar(hiOperand(Val));
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      _bswap(T_Lo);
+      _bswap(T_Hi);
+      _mov(DestLo, T_Hi);
+      _mov(DestHi, T_Lo);
+    } else if (Val->getType() == IceType_i32) {
+      Variable *T = legalizeToVar(Val);
+      _bswap(T);
+      _mov(Dest, T);
+    } else {
+      assert(Val->getType() == IceType_i16);
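+      // Rotating a 16-bit value left by 8 swaps its two bytes, which is
+      // exactly bswap on 16 bits; e.g. rol(0x1234, 8) == 0x3412.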
+      Val = legalize(Val);
+      Constant *Eight = Ctx->getConstantInt16(8);
+      Variable *T = nullptr;
+      _mov(T, Val);
+      _rol(T, Eight);
+      _mov(Dest, T);
+    }
+    return;
+  }
+  case Intrinsics::Ctpop: {
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
+                                        ? H_call_ctpop_i32
+                                        : H_call_ctpop_i64,
+                                    Dest, 1);
+    Call->addArg(Val);
+    lowerCall(Call);
+    // The popcount helpers always return 32-bit values, while the intrinsic's
+    // signature matches the native POPCNT instruction and fills a 64-bit reg
+    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
+    // the user doesn't do that in the IR. If the user does that in the IR,
+    // then this zeroing instruction is dead and gets optimized out.
+    if (Val->getType() == IceType_i64) {
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(DestHi, Zero);
+    }
+    return;
+  }
+  case Intrinsics::Ctlz: {
+    // The "is zero undef" parameter is ignored and we always return
+    // a well-defined value.
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = nullptr;
+    if (Val->getType() == IceType_i64) {
+      FirstVal = loOperand(Val);
+      SecondVal = hiOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    const bool IsCttz = false;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
+  case Intrinsics::Cttz: {
+    // The "is zero undef" parameter is ignored and we always return
+    // a well-defined value.
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = nullptr;
+    if (Val->getType() == IceType_i64) {
+      FirstVal = hiOperand(Val);
+      SecondVal = loOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    const bool IsCttz = true;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
+  case Intrinsics::Fabs: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Type Ty = Src->getType();
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeVectorOfFabsMask(Ty);
+    // The pand instruction operates on an m128 memory operand, so if
+    // Src is an f32 or f64, we need to make sure it's in a register.
+    if (isVectorType(Ty)) {
+      if (llvm::isa<OperandX8632Mem>(Src))
+        Src = legalizeToVar(Src);
+    } else {
+      Src = legalizeToVar(Src);
+    }
+    _pand(T, Src);
+    if (isVectorType(Ty))
+      _movp(Dest, T);
+    else
+      _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::Longjmp: {
+    InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Memcpy: {
+    // In the future, we could potentially emit an inline memcpy/memset, etc.
+    // for intrinsic calls w/ a known length.
+    InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    Call->addArg(Instr->getArg(2));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Memmove: {
+    InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    Call->addArg(Instr->getArg(2));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Memset: {
+    // The value operand needs to be extended to a stack slot size
+    // because the PNaCl ABI requires arguments to be at least 32 bits
+    // wide.
+    Operand *ValOp = Instr->getArg(1);
+    assert(ValOp->getType() == IceType_i8);
+    Variable *ValExt = Func->makeVariable(stackSlotType());
+    lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
+    InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(ValExt);
+    Call->addArg(Instr->getArg(2));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::NaClReadTP: {
+    if (Ctx->getFlags().getUseSandboxing()) {
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      Operand *Src =
+          OperandX8632Mem::create(Func, IceType_i32, nullptr, Zero, nullptr, 0,
+                                  OperandX8632Mem::SegReg_GS);
+      Variable *Dest = Instr->getDest();
+      Variable *T = nullptr;
+      _mov(T, Src);
+      _mov(Dest, T);
+    } else {
+      InstCall *Call = makeHelperCall(H_call_read_tp, Instr->getDest(), 0);
+      lowerCall(Call);
+    }
+    return;
+  }
+  case Intrinsics::Setjmp: {
+    InstCall *Call = makeHelperCall(H_call_setjmp, Instr->getDest(), 1);
+    Call->addArg(Instr->getArg(0));
+    lowerCall(Call);
+    return;
+  }
+  case Intrinsics::Sqrt: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeReg(Dest->getType());
+    _sqrtss(T, Src);
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::Stacksave: {
+    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
+    Variable *Dest = Instr->getDest();
+    _mov(Dest, esp);
+    return;
+  }
+  case Intrinsics::Stackrestore: {
+    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
+    _mov_nonkillable(esp, Instr->getArg(0));
+    return;
+  }
+  case Intrinsics::Trap:
+    _ud2();
+    return;
+  case Intrinsics::UnknownIntrinsic:
+    Func->setError("Should not be lowering UnknownIntrinsic");
+    return;
+  }
+  return;
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerAtomicCmpxchg(Variable *DestPrev,
+                                                Operand *Ptr, Operand *Expected,
+                                                Operand *Desired) {
+  if (Expected->getType() == IceType_i64) {
+    // Reserve the pre-colored registers first, before adding any more
+    // infinite-weight variables from formMemoryOperand's legalization.
+    Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
+    Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
+    _mov(T_eax, loOperand(Expected));
+    _mov(T_edx, hiOperand(Expected));
+    _mov(T_ebx, loOperand(Desired));
+    _mov(T_ecx, hiOperand(Desired));
+    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
+    const bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    Variable *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  Variable *T_eax = makeReg(Expected->getType(), RegX8632::Reg_eax);
+  _mov(T_eax, Expected);
+  OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
+  Variable *DesiredReg = legalizeToVar(Desired);
+  const bool Locked = true;
+  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
+  _mov(DestPrev, T_eax);
+}
+
+template <class Machine>
+bool TargetX86Base<Machine>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
+                                                      Operand *PtrToMem,
+                                                      Operand *Expected,
+                                                      Operand *Desired) {
+  if (Ctx->getFlags().getOptLevel() == Opt_m1)
+    return false;
+  // Peek ahead a few instructions and see how Dest is used.
+  // It's very common to have:
+  //
+  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
+  // [%y_phi = ...] // list of phi stores
+  // %p = icmp eq i32 %x, %expected
+  // br i1 %p, label %l1, label %l2
+  //
+  // which we can optimize into:
+  //
+  // %x = <cmpxchg code>
+  // [%y_phi = ...] // list of phi stores
+  // br eq, %l1, %l2
+  InstList::iterator I = Context.getCur();
+  // I is currently the InstIntrinsicCall. Peek past that.
+  // This assumes that the atomic cmpxchg has not been lowered yet,
+  // so that the instructions seen in the scan from "Cur" are simple.
+  assert(llvm::isa<InstIntrinsicCall>(*I));
+  Inst *NextInst = Context.getNextInst(I);
+  if (!NextInst)
+    return false;
+  // There might be phi assignments right before the compare+branch, since this
+  // could be a backward branch for a loop. This placement of assignments is
+  // determined by placePhiStores().
+  std::vector<InstAssign *> PhiAssigns;
+  while (InstAssign *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
+    if (PhiAssign->getDest() == Dest)
+      return false;
+    PhiAssigns.push_back(PhiAssign);
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+  }
+  if (InstIcmp *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
+    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
+          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
+           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
+      return false;
+    }
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+    if (InstBr *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
+      if (!NextBr->isUnconditional() &&
+          NextCmp->getDest() == NextBr->getCondition() &&
+          NextBr->isLastUse(NextCmp->getDest())) {
+        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
+        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
+          // Lower the phi assignments now, before the branch (same placement
+          // as before).
+          InstAssign *PhiAssign = PhiAssigns[i];
+          PhiAssign->setDeleted();
+          lowerAssign(PhiAssign);
+          Context.advanceNext();
+        }
+        _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
+        // Skip over the old compare and branch, by deleting them.
+        NextCmp->setDeleted();
+        NextBr->setDeleted();
+        Context.advanceNext();
+        Context.advanceNext();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
+                                            Operand *Ptr, Operand *Val) {
+  bool NeedsCmpxchg = false;
+  LowerBinOp Op_Lo = nullptr;
+  LowerBinOp Op_Hi = nullptr;
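+  // Op_Lo/Op_Hi are pointers to member functions (e.g.
+  // &TargetX86Base<Machine>::_add) that the cmpxchg loop in
+  // expandAtomicRMWAsCmpxchg applies to the low and high halves.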
+  switch (Operation) {
+  default:
+    Func->setError("Unknown AtomicRMW operation");
+    return;
+  case Intrinsics::AtomicAdd: {
+    if (Dest->getType() == IceType_i64) {
+      // All the fall-through paths must set this to true, but this
+      // variable is only used for asserting.
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX86Base<Machine>::_add;
+      Op_Hi = &TargetX86Base<Machine>::_adc;
+      break;
+    }
+    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    const bool Locked = true;
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::AtomicSub: {
+    if (Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX86Base<Machine>::_sub;
+      Op_Hi = &TargetX86Base<Machine>::_sbb;
+      break;
+    }
+    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    const bool Locked = true;
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _neg(T);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
+    return;
+  }
+  case Intrinsics::AtomicOr:
+    // TODO(jvoung): If Dest is null or dead, then some of these
+    // operations do not need an "exchange", but just a locked op.
+    // That appears to be "worth" it for sub, or, and, and xor.
+    // xadd is probably fine vs lock add for add, and xchg is fine
+    // vs an atomic store.
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX86Base<Machine>::_or;
+    Op_Hi = &TargetX86Base<Machine>::_or;
+    break;
+  case Intrinsics::AtomicAnd:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX86Base<Machine>::_and;
+    Op_Hi = &TargetX86Base<Machine>::_and;
+    break;
+  case Intrinsics::AtomicXor:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX86Base<Machine>::_xor;
+    Op_Hi = &TargetX86Base<Machine>::_xor;
+    break;
+  case Intrinsics::AtomicExchange:
+    if (Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
+      // just need to be moved to the ecx and ebx registers.
+      Op_Lo = nullptr;
+      Op_Hi = nullptr;
+      break;
+    }
+    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
+    Variable *T = nullptr;
+    _mov(T, Val);
+    _xchg(Addr, T);
+    _mov(Dest, T);
+    return;
+  }
+  // Otherwise, we need a cmpxchg loop.
+  (void)NeedsCmpxchg;
+  assert(NeedsCmpxchg);
+  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
+                                                      LowerBinOp Op_Hi,
+                                                      Variable *Dest,
+                                                      Operand *Ptr,
+                                                      Operand *Val) {
+  // Expand a more complex RMW operation as a cmpxchg loop:
+  // For 64-bit:
+  //   mov     eax, [ptr]
+  //   mov     edx, [ptr + 4]
+  // .LABEL:
+  //   mov     ebx, eax
+  //   <Op_Lo> ebx, <desired_adj_lo>
+  //   mov     ecx, edx
+  //   <Op_Hi> ecx, <desired_adj_hi>
+  //   lock cmpxchg8b [ptr]
+  //   jne     .LABEL
+  //   mov     <dest_lo>, eax
+  //   mov     <dest_hi>, edx
+  //
+  // For 32-bit:
+  //   mov     eax, [ptr]
+  // .LABEL:
+  //   mov     <reg>, eax
+  //   op      <reg>, [desired_adj]
+  //   lock cmpxchg [ptr], <reg>
+  //   jne     .LABEL
+  //   mov     <dest>, eax
+  //
+  // If Op_{Lo,Hi} are nullptr, then just copy the value.
+  Val = legalize(Val);
+  Type Ty = Val->getType();
+  if (Ty == IceType_i64) {
+    Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
+    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
+    _mov(T_eax, loOperand(Addr));
+    _mov(T_edx, hiOperand(Addr));
+    Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
+    InstX8632Label *Label = InstX8632Label::create(Func, this);
+    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
+    if (!IsXchg8b) {
+      Context.insert(Label);
+      _mov(T_ebx, T_eax);
+      (this->*Op_Lo)(T_ebx, loOperand(Val));
+      _mov(T_ecx, T_edx);
+      (this->*Op_Hi)(T_ecx, hiOperand(Val));
+    } else {
+      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
+      // It just needs the Val loaded into ebx and ecx.
+      // That can also be done before the loop.
+      _mov(T_ebx, loOperand(Val));
+      _mov(T_ecx, hiOperand(Val));
+      Context.insert(Label);
+    }
+    const bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    _br(CondX86::Br_ne, Label);
+    if (!IsXchg8b) {
+      // If Val is a variable, model the extended live range of Val through
+      // the end of the loop, since it will be re-used by the loop.
+      if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+        Variable *ValLo = llvm::cast<Variable>(loOperand(ValVar));
+        Variable *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
+        Context.insert(InstFakeUse::create(Func, ValLo));
+        Context.insert(InstFakeUse::create(Func, ValHi));
+      }
+    } else {
+      // For xchg, the loop is slightly smaller and ebx/ecx are used.
+      Context.insert(InstFakeUse::create(Func, T_ebx));
+      Context.insert(InstFakeUse::create(Func, T_ecx));
+    }
+    // The address base (if any) is also reused in the loop.
+    if (Variable *Base = Addr->getBase())
+      Context.insert(InstFakeUse::create(Func, Base));
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
+  Variable *T_eax = makeReg(Ty, RegX8632::Reg_eax);
+  _mov(T_eax, Addr);
+  InstX8632Label *Label = InstX8632Label::create(Func, this);
+  Context.insert(Label);
+  // We want to pick a different register for T than eax, so don't use
+  // _mov(T == nullptr, T_eax).
+  Variable *T = makeReg(Ty);
+  _mov(T, T_eax);
+  (this->*Op_Lo)(T, Val);
+  const bool Locked = true;
+  _cmpxchg(Addr, T_eax, T, Locked);
+  _br(CondX86::Br_ne, Label);
+  // If Val is a variable, model the extended live range of Val through
+  // the end of the loop, since it will be re-used by the loop.
+  if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+    Context.insert(InstFakeUse::create(Func, ValVar));
+  }
+  // The address base (if any) is also reused in the loop.
+  if (Variable *Base = Addr->getBase())
+    Context.insert(InstFakeUse::create(Func, Base));
+  _mov(Dest, T_eax);
+}
+
+// Lowers count {trailing, leading} zeros intrinsic.
+//
+// We could do constant folding here, but that should have
+// been done by the front-end/middle-end optimizations.
+template <class Machine>
+void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
+                                             Operand *FirstVal,
+                                             Operand *SecondVal) {
+  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
+  // Then the instructions will handle the Val == 0 case much more simply
+  // and won't require conversion from bit position to number of zeros.
+  //
+  // Otherwise:
+  //   bsr IF_NOT_ZERO, Val
+  //   mov T_DEST, 63
+  //   cmovne T_DEST, IF_NOT_ZERO
+  //   xor T_DEST, 31
+  //   mov DEST, T_DEST
+  //
+  // NOTE: T_DEST must be a register because cmov requires its dest to be a
+  // register. Also, bsf and bsr require their dest to be a register.
+  //
+  // The xor DEST, 31 converts a bit position to # of leading zeroes.
+  // E.g., for 000... 00001100, bsr will say that the most significant bit
+  // set is at position 3, while the number of leading zeros is 28. Xor is
+  // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
+  //
+  // Similar for 64-bit, but start w/ speculating that the upper 32 bits
+  // are all zero, and compute the result for that case (checking the lower
+  // 32 bits). Then actually compute the result for the upper bits and
+  // cmov in the result from the lower computation if the earlier speculation
+  // was correct.
+  //
+  // Cttz is similar, but uses bsf instead, doesn't require the xor
+  // bit-position conversion, and the speculation is reversed.
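+  //
+  // For example (Cttz): bsf on 0x00000008 yields 3 == cttz; for an input
+  // of zero, the cmov below leaves the preloaded constant 32 in place.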
+  assert(Ty == IceType_i32 || Ty == IceType_i64);
+  Variable *T = makeReg(IceType_i32);
+  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
+  if (Cttz) {
+    _bsf(T, FirstValRM);
+  } else {
+    _bsr(T, FirstValRM);
+  }
+  Variable *T_Dest = makeReg(IceType_i32);
+  Constant *ThirtyTwo = Ctx->getConstantInt32(32);
+  Constant *ThirtyOne = Ctx->getConstantInt32(31);
+  if (Cttz) {
+    _mov(T_Dest, ThirtyTwo);
+  } else {
+    Constant *SixtyThree = Ctx->getConstantInt32(63);
+    _mov(T_Dest, SixtyThree);
+  }
+  _cmov(T_Dest, T, CondX86::Br_ne);
+  if (!Cttz) {
+    _xor(T_Dest, ThirtyOne);
+  }
+  if (Ty == IceType_i32) {
+    _mov(Dest, T_Dest);
+    return;
+  }
+  _add(T_Dest, ThirtyTwo);
+  Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+  Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+  // Will be using "test" on this, so we need a registerized variable.
+  Variable *SecondVar = legalizeToVar(SecondVal);
+  Variable *T_Dest2 = makeReg(IceType_i32);
+  if (Cttz) {
+    _bsf(T_Dest2, SecondVar);
+  } else {
+    _bsr(T_Dest2, SecondVar);
+    _xor(T_Dest2, ThirtyOne);
+  }
+  _test(SecondVar, SecondVar);
+  _cmov(T_Dest2, T_Dest, CondX86::Br_e);
+  _mov(DestLo, T_Dest2);
+  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+}
+
+bool isAdd(const Inst *Inst) {
+  if (const InstArithmetic *Arith =
+          llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
+    return (Arith->getOp() == InstArithmetic::Add);
+  }
+  return false;
+}
+
+void dumpAddressOpt(const Cfg *Func, const Variable *Base,
+                    const Variable *Index, uint16_t Shift, int32_t Offset,
+                    const Inst *Reason) {
+  if (!ALLOW_DUMP)
+    return;
+  if (!Func->isVerbose(IceV_AddrOpt))
+    return;
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "Instruction: ";
+  Reason->dumpDecorated(Func);
+  Str << "  results in Base=";
+  if (Base)
+    Base->dump(Func);
+  else
+    Str << "<null>";
+  Str << ", Index=";
+  if (Index)
+    Index->dump(Func);
+  else
+    Str << "<null>";
+  Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n";
+}
+
+bool matchTransitiveAssign(const VariablesMetadata *VMetadata, Variable *&Var,
+                           const Inst *&Reason) {
+  // Var originates from Var=SrcVar ==>
+  //   set Var:=SrcVar
+  if (Var == nullptr)
+    return false;
+  if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) {
+    assert(!VMetadata->isMultiDef(Var));
+    if (llvm::isa<InstAssign>(VarAssign)) {
+      Operand *SrcOp = VarAssign->getSrc(0);
+      assert(SrcOp);
+      if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
+        if (!VMetadata->isMultiDef(SrcVar) &&
+            // TODO: ensure SrcVar stays single-BB
+            true) {
+          Var = SrcVar;
+          Reason = VarAssign;
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool matchCombinedBaseIndex(const VariablesMetadata *VMetadata, Variable *&Base,
+                            Variable *&Index, uint16_t &Shift,
+                            const Inst *&Reason) {
+  // Index==nullptr && Base is Base=Var1+Var2 ==>
+  //   set Base=Var1, Index=Var2, Shift=0
+  if (Base == nullptr)
+    return false;
+  if (Index != nullptr)
+    return false;
+  const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
+  if (BaseInst == nullptr)
+    return false;
+  assert(!VMetadata->isMultiDef(Base));
+  if (BaseInst->getSrcSize() < 2)
+    return false;
+  if (Variable *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
+    if (VMetadata->isMultiDef(Var1))
+      return false;
+    if (Variable *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
+      if (VMetadata->isMultiDef(Var2))
+        return false;
+      if (isAdd(BaseInst) &&
+          // TODO: ensure Var1 and Var2 stay single-BB
+          true) {
+        Base = Var1;
+        Index = Var2;
+        Shift = 0; // should already have been 0
+        Reason = BaseInst;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool matchShiftedIndex(const VariablesMetadata *VMetadata, Variable *&Index,
+                       uint16_t &Shift, const Inst *&Reason) {
+  // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
+  //   Index=Var, Shift+=log2(Const)
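+  //
+  // For example, Index = Var * 4 with Shift == 1 becomes Index = Var,
+  // Shift == 3, i.e. an overall scale of 8.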
+  if (Index == nullptr)
+    return false;
+  const Inst *IndexInst = VMetadata->getSingleDefinition(Index);
+  if (IndexInst == nullptr)
+    return false;
+  assert(!VMetadata->isMultiDef(Index));
+  if (IndexInst->getSrcSize() < 2)
+    return false;
+  if (const InstArithmetic *ArithInst =
+          llvm::dyn_cast<InstArithmetic>(IndexInst)) {
+    if (Variable *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
+      if (ConstantInteger32 *Const =
+              llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
+        if (ArithInst->getOp() == InstArithmetic::Mul &&
+            !VMetadata->isMultiDef(Var) && Const->getType() == IceType_i32) {
+          uint64_t Mult = Const->getValue();
+          uint32_t LogMult;
+          switch (Mult) {
+          case 1:
+            LogMult = 0;
+            break;
+          case 2:
+            LogMult = 1;
+            break;
+          case 4:
+            LogMult = 2;
+            break;
+          case 8:
+            LogMult = 3;
+            break;
+          default:
+            return false;
+          }
+          if (Shift + LogMult <= 3) {
+            Index = Var;
+            Shift += LogMult;
+            Reason = IndexInst;
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool matchOffsetBase(const VariablesMetadata *VMetadata, Variable *&Base,
+                     int32_t &Offset, const Inst *&Reason) {
+  // Base is Base=Var+Const || Base is Base=Const+Var ==>
+  //   set Base=Var, Offset+=Const
+  // Base is Base=Var-Const ==>
+  //   set Base=Var, Offset-=Const
+  if (Base == nullptr)
+    return false;
+  const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
+  if (BaseInst == nullptr)
+    return false;
+  assert(!VMetadata->isMultiDef(Base));
+  if (const InstArithmetic *ArithInst =
+          llvm::dyn_cast<const InstArithmetic>(BaseInst)) {
+    if (ArithInst->getOp() != InstArithmetic::Add &&
+        ArithInst->getOp() != InstArithmetic::Sub)
+      return false;
+    bool IsAdd = ArithInst->getOp() == InstArithmetic::Add;
+    Variable *Var = nullptr;
+    ConstantInteger32 *Const = nullptr;
+    if (Variable *VariableOperand =
+            llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
+      Var = VariableOperand;
+      Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1));
+    } else if (IsAdd) {
+      Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(0));
+      Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(1));
+    }
+    if (Var == nullptr || Const == nullptr || VMetadata->isMultiDef(Var))
+      return false;
+    int32_t MoreOffset = IsAdd ? Const->getValue() : -Const->getValue();
+    if (Utils::WouldOverflowAdd(Offset, MoreOffset))
+      return false;
+    Base = Var;
+    Offset += MoreOffset;
+    Reason = BaseInst;
+    return true;
+  }
+  return false;
+}
+
+void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
+                       Variable *&Index, uint16_t &Shift, int32_t &Offset) {
+  Func->resetCurrentNode();
+  if (Func->isVerbose(IceV_AddrOpt)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << "\nStarting computeAddressOpt for instruction:\n  ";
+    Instr->dumpDecorated(Func);
+  }
+  (void)Offset; // TODO: pattern-match for non-zero offsets.
+  if (Base == nullptr)
+    return;
+  // If the Base has more than one use or is live across multiple
+  // blocks, then don't go further.  Alternatively (?), never consider
+  // a transformation that would change a variable that is currently
+  // *not* live across basic block boundaries into one that *is*.
+  if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/)
+    return;
+
+  const VariablesMetadata *VMetadata = Func->getVMetadata();
+  bool Continue = true;
+  while (Continue) {
+    const Inst *Reason = nullptr;
+    if (matchTransitiveAssign(VMetadata, Base, Reason) ||
+        matchTransitiveAssign(VMetadata, Index, Reason) ||
+        matchCombinedBaseIndex(VMetadata, Base, Index, Shift, Reason) ||
+        matchShiftedIndex(VMetadata, Index, Shift, Reason) ||
+        matchOffsetBase(VMetadata, Base, Offset, Reason)) {
+      dumpAddressOpt(Func, Base, Index, Shift, Offset, Reason);
+    } else {
+      Continue = false;
+    }
+
+    // Index is Index=Var<<Const && Const+Shift<=3 ==>
+    //   Index=Var, Shift+=Const
+
+    // Index is Index=Const*Var && log2(Const)+Shift<=3 ==>
+    //   Index=Var, Shift+=log2(Const)
+
+    // Index && Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
+    //   swap(Index,Base)
+    // Similar for Base=Const*Var and Base=Var<<Const
+
+    // Index is Index=Var+Const ==>
+    //   set Index=Var, Offset+=(Const<<Shift)
+
+    // Index is Index=Const+Var ==>
+    //   set Index=Var, Offset+=(Const<<Shift)
+
+    // Index is Index=Var-Const ==>
+    //   set Index=Var, Offset-=(Const<<Shift)
+
+    // TODO: consider overflow issues with respect to Offset.
+    // TODO: handle symbolic constants.
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerLoad(const InstLoad *Load) {
+  // A Load instruction can be treated the same as an Assign
+  // instruction, after the source operand is transformed into an
+  // OperandX8632Mem operand.  Note that the address mode
+  // optimization already creates an OperandX8632Mem operand, so it
+  // doesn't need another level of transformation.
+  Variable *DestLoad = Load->getDest();
+  Type Ty = DestLoad->getType();
+  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
+  InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
+  lowerAssign(Assign);
+}
+
+template <class Machine> void TargetX86Base<Machine>::doAddressOptLoad() {
+  Inst *Inst = Context.getCur();
+  Variable *Dest = Inst->getDest();
+  Operand *Addr = Inst->getSrc(0);
+  Variable *Index = nullptr;
+  uint16_t Shift = 0;
+  int32_t Offset = 0; // TODO: make Constant
+  // Vanilla ICE load instructions should not use the segment registers,
+  // and computeAddressOpt only works at the level of Variables and Constants,
+  // not other OperandX8632Mem, so there should be no mention of segment
+  // registers there either.
+  const OperandX8632Mem::SegmentRegisters SegmentReg =
+      OperandX8632Mem::DefaultSegment;
+  Variable *Base = llvm::dyn_cast<Variable>(Addr);
+  computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
+  if (Base && Addr != Base) {
+    Inst->setDeleted();
+    Constant *OffsetOp = Ctx->getConstantInt32(Offset);
+    Addr = OperandX8632Mem::create(Func, Dest->getType(), Base, OffsetOp, Index,
+                                   Shift, SegmentReg);
+    Context.insert(InstLoad::create(Func, Dest, Addr));
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::randomlyInsertNop(float Probability) {
+  RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
+  if (RNG.getTrueWithProbability(Probability)) {
+    _nop(RNG(Traits::X86_NUM_NOP_VARIANTS));
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerPhi(const InstPhi * /*Inst*/) {
+  Func->setError("Phi found in regular instruction list");
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerRet(const InstRet *Inst) {
+  Variable *Reg = nullptr;
+  if (Inst->hasRetValue()) {
+    Operand *Src0 = legalize(Inst->getRetValue());
+    if (Src0->getType() == IceType_i64) {
+      Variable *eax = legalizeToVar(loOperand(Src0), RegX8632::Reg_eax);
+      Variable *edx = legalizeToVar(hiOperand(Src0), RegX8632::Reg_edx);
+      Reg = eax;
+      Context.insert(InstFakeUse::create(Func, edx));
+    } else if (isScalarFloatingType(Src0->getType())) {
+      _fld(Src0);
+    } else if (isVectorType(Src0->getType())) {
+      Reg = legalizeToVar(Src0, RegX8632::Reg_xmm0);
+    } else {
+      _mov(Reg, Src0, RegX8632::Reg_eax);
+    }
+  }
+  // Add a ret instruction even if sandboxing is enabled, because
+  // addEpilog explicitly looks for a ret instruction as a marker for
+  // where to insert the frame removal instructions.
+  _ret(Reg);
+  // Add a fake use of esp to make sure esp stays alive for the entire
+  // function.  Otherwise post-call esp adjustments get dead-code
+  // eliminated.  TODO: Are there more places where the fake use
+  // should be inserted?  E.g. "void f(int n){while(1) g(n);}" may not
+  // have a ret instruction.
+  Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
+  Context.insert(InstFakeUse::create(Func, esp));
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerSelect(const InstSelect *Inst) {
+  Variable *Dest = Inst->getDest();
+  Type DestTy = Dest->getType();
+  Operand *SrcT = Inst->getTrueOperand();
+  Operand *SrcF = Inst->getFalseOperand();
+  Operand *Condition = Inst->getCondition();
+
+  if (isVectorType(DestTy)) {
+    Type SrcTy = SrcT->getType();
+    Variable *T = makeReg(SrcTy);
+    Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
+    Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
+    if (InstructionSet >= Machine::SSE4_1) {
+      // TODO(wala): If the condition operand is a constant, use blendps
+      // or pblendw.
+      //
+      // Use blendvps or pblendvb to implement select.
+      if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
+          SrcTy == IceType_v4f32) {
+        Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
+        Variable *xmm0 = makeReg(IceType_v4i32, RegX8632::Reg_xmm0);
+        _movp(xmm0, ConditionRM);
+        _psll(xmm0, Ctx->getConstantInt8(31));
+        _movp(T, SrcFRM);
+        _blendvps(T, SrcTRM, xmm0);
+        _movp(Dest, T);
+      } else {
+        assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
+        Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
+                                                              : IceType_v16i8;
+        Variable *xmm0 = makeReg(SignExtTy, RegX8632::Reg_xmm0);
+        lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
+        _movp(T, SrcFRM);
+        _pblendvb(T, SrcTRM, xmm0);
+        _movp(Dest, T);
+      }
+      return;
+    }
+    // Lower select without Machine::SSE4.1:
+    // a=d?b:c ==>
+    //   if elementtype(d) != i1:
+    //      d=sext(d);
+    //   a=(b&d)|(c&~d);
+    Variable *T2 = makeReg(SrcTy);
+    // Sign extend the condition operand if applicable.
+    if (SrcTy == IceType_v4f32) {
+      // The sext operation takes only integer arguments.
+      Variable *T3 = Func->makeVariable(IceType_v4i32);
+      lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
+      _movp(T, T3);
+    } else if (typeElementType(SrcTy) != IceType_i1) {
+      lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
+    } else {
+      Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
+      _movp(T, ConditionRM);
+    }
+    _movp(T2, T);
+    _pand(T, SrcTRM);
+    _pandn(T2, SrcFRM);
+    _por(T, T2);
+    _movp(Dest, T);
+
+    return;
+  }
+
+  CondX86::BrCond Cond = CondX86::Br_ne;
+  Operand *CmpOpnd0 = nullptr;
+  Operand *CmpOpnd1 = nullptr;
+  // Handle folding opportunities.
+  if (const class Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32: {
+      auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
+      Cond = Traits::getIcmp32Mapping(Cmp->getCondition());
+      CmpOpnd1 = legalize(Producer->getSrc(1));
+      CmpOpnd0 = legalizeSrc0ForCmp(Producer->getSrc(0), CmpOpnd1);
+    } break;
+    }
+  }
+  if (CmpOpnd0 == nullptr) {
+    CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem);
+    CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
+  }
+  assert(CmpOpnd0);
+  assert(CmpOpnd1);
+
+  _cmp(CmpOpnd0, CmpOpnd1);
+  if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
+    // The cmov instruction doesn't allow 8-bit or FP operands, so
+    // we need explicit control flow.
+    // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
+    InstX8632Label *Label = InstX8632Label::create(Func, this);
+    SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
+    _mov(Dest, SrcT);
+    _br(Cond, Label);
+    SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
+    _mov_nonkillable(Dest, SrcF);
+    Context.insert(Label);
+    return;
+  }
+  // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
+  // But if SrcT is immediate, we might be able to do better, as
+  // the cmov instruction doesn't allow an immediate operand:
+  // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
+  if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
+    std::swap(SrcT, SrcF);
+    Cond = InstX8632::getOppositeCondition(Cond);
+  }
+  if (DestTy == IceType_i64) {
+    // Set the low portion.
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *TLo = nullptr;
+    Operand *SrcFLo = legalize(loOperand(SrcF));
+    _mov(TLo, SrcFLo);
+    Operand *SrcTLo = legalize(loOperand(SrcT), Legal_Reg | Legal_Mem);
+    _cmov(TLo, SrcTLo, Cond);
+    _mov(DestLo, TLo);
+    // Set the high portion.
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Variable *THi = nullptr;
+    Operand *SrcFHi = legalize(hiOperand(SrcF));
+    _mov(THi, SrcFHi);
+    Operand *SrcTHi = legalize(hiOperand(SrcT), Legal_Reg | Legal_Mem);
+    _cmov(THi, SrcTHi, Cond);
+    _mov(DestHi, THi);
+    return;
+  }
+
+  assert(DestTy == IceType_i16 || DestTy == IceType_i32);
+  Variable *T = nullptr;
+  SrcF = legalize(SrcF);
+  _mov(T, SrcF);
+  SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
+  _cmov(T, SrcT, Cond);
+  _mov(Dest, T);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerStore(const InstStore *Inst) {
+  Operand *Value = Inst->getData();
+  Operand *Addr = Inst->getAddr();
+  OperandX8632Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
+  Type Ty = NewAddr->getType();
+
+  if (Ty == IceType_i64) {
+    Value = legalize(Value);
+    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
+    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
+    _store(ValueHi, llvm::cast<OperandX8632Mem>(hiOperand(NewAddr)));
+    _store(ValueLo, llvm::cast<OperandX8632Mem>(loOperand(NewAddr)));
+  } else if (isVectorType(Ty)) {
+    _storep(legalizeToVar(Value), NewAddr);
+  } else {
+    Value = legalize(Value, Legal_Reg | Legal_Imm);
+    _store(Value, NewAddr);
+  }
+}
+
+template <class Machine> void TargetX86Base<Machine>::doAddressOptStore() {
+  InstStore *Inst = llvm::cast<InstStore>(Context.getCur());
+  Operand *Data = Inst->getData();
+  Operand *Addr = Inst->getAddr();
+  Variable *Index = nullptr;
+  uint16_t Shift = 0;
+  int32_t Offset = 0; // TODO: make Constant
+  Variable *Base = llvm::dyn_cast<Variable>(Addr);
+  // Vanilla ICE store instructions should not use the segment registers,
+  // and computeAddressOpt only works at the level of Variables and Constants,
+  // not other OperandX8632Mem, so there should be no mention of segment
+  // registers there either.
+  const OperandX8632Mem::SegmentRegisters SegmentReg =
+      OperandX8632Mem::DefaultSegment;
+  computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
+  if (Base && Addr != Base) {
+    Inst->setDeleted();
+    Constant *OffsetOp = Ctx->getConstantInt32(Offset);
+    Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
+                                   Shift, SegmentReg);
+    InstStore *NewStore = InstStore::create(Func, Data, Addr);
+    if (Inst->getDest())
+      NewStore->setRmwBeacon(Inst->getRmwBeacon());
+    Context.insert(NewStore);
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerSwitch(const InstSwitch *Inst) {
+  // This implements the most naive possible lowering.
+  // cmp a,val[0]; jeq label[0]; cmp a,val[1]; jeq label[1]; ... jmp default
+  Operand *Src0 = Inst->getComparison();
+  SizeT NumCases = Inst->getNumCases();
+  if (Src0->getType() == IceType_i64) {
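+    // For a 64-bit comparison each case tests the two halves separately,
+    // roughly (sketch):
+    //   cmp lo, value_lo; jne .Lskip; cmp hi, value_hi; je label; .Lskip: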
+    Src0 = legalize(Src0); // get Base/Index into physical registers
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    if (NumCases >= 2) {
+      Src0Lo = legalizeToVar(Src0Lo);
+      Src0Hi = legalizeToVar(Src0Hi);
+    } else {
+      Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
+      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
+    }
+    for (SizeT I = 0; I < NumCases; ++I) {
+      Constant *ValueLo = Ctx->getConstantInt32(Inst->getValue(I));
+      Constant *ValueHi = Ctx->getConstantInt32(Inst->getValue(I) >> 32);
+      InstX8632Label *Label = InstX8632Label::create(Func, this);
+      _cmp(Src0Lo, ValueLo);
+      _br(CondX86::Br_ne, Label);
+      _cmp(Src0Hi, ValueHi);
+      _br(CondX86::Br_e, Inst->getLabel(I));
+      Context.insert(Label);
+    }
+    _br(Inst->getLabelDefault());
+    return;
+  }
+  // OK, we'll be slightly less naive by forcing Src into a physical
+  // register if there are 2 or more uses.
+  if (NumCases >= 2)
+    Src0 = legalizeToVar(Src0);
+  else
+    Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
+  for (SizeT I = 0; I < NumCases; ++I) {
+    Constant *Value = Ctx->getConstantInt32(Inst->getValue(I));
+    _cmp(Src0, Value);
+    _br(CondX86::Br_e, Inst->getLabel(I));
+  }
+
+  _br(Inst->getLabelDefault());
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::scalarizeArithmetic(InstArithmetic::OpKind Kind,
+                                                 Variable *Dest, Operand *Src0,
+                                                 Operand *Src1) {
+  assert(isVectorType(Dest->getType()));
+  Type Ty = Dest->getType();
+  Type ElementTy = typeElementType(Ty);
+  SizeT NumElements = typeNumElements(Ty);
+
+  Operand *T = Ctx->getConstantUndef(Ty);
+  for (SizeT I = 0; I < NumElements; ++I) {
+    Constant *Index = Ctx->getConstantInt32(I);
+
+    // Extract the next two inputs.
+    Variable *Op0 = Func->makeVariable(ElementTy);
+    lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
+    Variable *Op1 = Func->makeVariable(ElementTy);
+    lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
+
+    // Perform the arithmetic as a scalar operation.
+    Variable *Res = Func->makeVariable(ElementTy);
+    lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
+
+    // Insert the result into position.
+    Variable *DestT = Func->makeVariable(Ty);
+    lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
+    T = DestT;
+  }
+
+  lowerAssign(InstAssign::create(Func, Dest, T));
+}
+
+// The following pattern occurs often in lowered C and C++ code:
+//
+//   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
+//   %cmp.ext = sext <n x i1> %cmp to <n x ty>
+//
+// We can eliminate the sext operation by copying the result of pcmpeqd,
+// pcmpgtd, or cmpps (which produce sign extended results) to the result
+// of the sext operation.
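+//
+// For example (sketch), for
+//   %cmp     = icmp sgt <4 x i32> %a, %b
+//   %cmp.ext = sext <4 x i1> %cmp to <4 x i32>
+// the pcmpgtd result already holds the sign-extended values, so the sext
+// lowers to a single movp into %cmp.ext and the cast instruction is deleted.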
+template <class Machine>
+void TargetX86Base<Machine>::eliminateNextVectorSextInstruction(
+    Variable *SignExtendedResult) {
+  if (InstCast *NextCast =
+          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+    if (NextCast->getCastKind() == InstCast::Sext &&
+        NextCast->getSrc(0) == SignExtendedResult) {
+      NextCast->setDeleted();
+      _movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
+      // Skip over the instruction.
+      Context.advanceNext();
+    }
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerUnreachable(
+    const InstUnreachable * /*Inst*/) {
+  _ud2();
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerRMW(const InstX8632FakeRMW *RMW) {
+  // If the beacon variable's live range does not end in this
+  // instruction, then it must end in the modified Store instruction
+  // that follows.  This means that the original Store instruction is
+  // still there, either because the value being stored is used beyond
+  // the Store instruction, or because dead code elimination did not
+  // happen.  In either case, we cancel RMW lowering (and the caller
+  // deletes the RMW instruction).
+  if (!RMW->isLastUse(RMW->getBeacon()))
+    return;
+  Operand *Src = RMW->getData();
+  Type Ty = Src->getType();
+  OperandX8632Mem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
+  if (Ty == IceType_i64) {
+    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
+    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
+    OperandX8632Mem *AddrLo = llvm::cast<OperandX8632Mem>(loOperand(Addr));
+    OperandX8632Mem *AddrHi = llvm::cast<OperandX8632Mem>(hiOperand(Addr));
+    switch (RMW->getOp()) {
+    default:
+      // TODO(stichnot): Implement other arithmetic operators.
+      break;
+    case InstArithmetic::Add:
+      _add_rmw(AddrLo, SrcLo);
+      _adc_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Sub:
+      _sub_rmw(AddrLo, SrcLo);
+      _sbb_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::And:
+      _and_rmw(AddrLo, SrcLo);
+      _and_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Or:
+      _or_rmw(AddrLo, SrcLo);
+      _or_rmw(AddrHi, SrcHi);
+      return;
+    case InstArithmetic::Xor:
+      _xor_rmw(AddrLo, SrcLo);
+      _xor_rmw(AddrHi, SrcHi);
+      return;
+    }
+  } else {
+    // i8, i16, i32
+    switch (RMW->getOp()) {
+    default:
+      // TODO(stichnot): Implement other arithmetic operators.
+      break;
+    case InstArithmetic::Add:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _add_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Sub:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _sub_rmw(Addr, Src);
+      return;
+    case InstArithmetic::And:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _and_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Or:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _or_rmw(Addr, Src);
+      return;
+    case InstArithmetic::Xor:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _xor_rmw(Addr, Src);
+      return;
+    }
+  }
+  llvm::report_fatal_error("Couldn't lower RMW instruction");
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerOther(const Inst *Instr) {
+  if (const auto *RMW = llvm::dyn_cast<InstX8632FakeRMW>(Instr)) {
+    lowerRMW(RMW);
+  } else {
+    TargetLowering::lowerOther(Instr);
+  }
+}
+
+// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
+// preserve integrity of liveness analysis.  Undef values are also
+// turned into zeroes, since loOperand() and hiOperand() don't expect
+// Undef input.
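+//
+// For example (illustrative; the .lo/.hi names stand in for the
+// variables created by loOperand() and hiOperand()):
+//   %x = phi i64 [ %a, %pred1 ], [ %b, %pred2 ]
+// becomes
+//   %x.lo = phi i32 [ %a.lo, %pred1 ], [ %b.lo, %pred2 ]
+//   %x.hi = phi i32 [ %a.hi, %pred1 ], [ %b.hi, %pred2 ]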
+template <class Machine> void TargetX86Base<Machine>::prelowerPhis() {
+  // Pause constant blinding or pooling; it will be done later, during the
+  // phi lowering assignments.
+  BoolFlagSaver B(RandomizationPoolingPaused, true);
+
+  CfgNode *Node = Context.getNode();
+  for (Inst &I : Node->getPhis()) {
+    auto *Phi = llvm::cast<InstPhi>(&I);
+    if (Phi->isDeleted())
+      continue;
+    Variable *Dest = Phi->getDest();
+    if (Dest->getType() == IceType_i64) {
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      InstPhi *PhiLo = InstPhi::create(Func, Phi->getSrcSize(), DestLo);
+      InstPhi *PhiHi = InstPhi::create(Func, Phi->getSrcSize(), DestHi);
+      for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
+        Operand *Src = Phi->getSrc(I);
+        CfgNode *Label = Phi->getLabel(I);
+        if (llvm::isa<ConstantUndef>(Src))
+          Src = Ctx->getConstantZero(Dest->getType());
+        PhiLo->addArgument(loOperand(Src), Label);
+        PhiHi->addArgument(hiOperand(Src), Label);
+      }
+      Node->getPhis().push_back(PhiLo);
+      Node->getPhis().push_back(PhiHi);
+      Phi->setDeleted();
+    }
+  }
+}
+
+inline bool isMemoryOperand(const Operand *Opnd) {
+  if (const auto Var = llvm::dyn_cast<Variable>(Opnd))
+    return !Var->hasReg();
+  // We treat vector undef values the same as a memory operand,
+  // because they do in fact need a register to materialize the vector
+  // of zeroes into.
+  if (llvm::isa<ConstantUndef>(Opnd))
+    return isScalarFloatingType(Opnd->getType()) ||
+           isVectorType(Opnd->getType());
+  if (llvm::isa<Constant>(Opnd))
+    return isScalarFloatingType(Opnd->getType());
+  return true;
+}
+
+// Lower the pre-ordered list of assignments into mov instructions.
+// This also performs some ad hoc register allocation where necessary.
+template <class Machine>
+void TargetX86Base<Machine>::lowerPhiAssignments(
+    CfgNode *Node, const AssignList &Assignments) {
+  // Check that this is a properly initialized shell of a node.
+  assert(Node->getOutEdges().size() == 1);
+  assert(Node->getInsts().empty());
+  assert(Node->getPhis().empty());
+  CfgNode *Succ = Node->getOutEdges().front();
+  getContext().init(Node);
+  // Register set setup similar to regAlloc().
+  RegSetMask RegInclude = RegSet_All;
+  RegSetMask RegExclude = RegSet_StackPointer;
+  if (hasFramePointer())
+    RegExclude |= RegSet_FramePointer;
+  llvm::SmallBitVector Available = getRegisterSet(RegInclude, RegExclude);
+  bool NeedsRegs = false;
+  // Initialize the set of available registers to the set of what is
+  // available (not live) at the beginning of the successor block,
+  // minus all registers used as Dest operands in the Assignments.  To
+  // do this, we start off assuming all registers are available, then
+  // iterate through the Assignments and remove Dest registers.
+  // During this iteration, we also determine whether we will actually
+  // need any extra registers for memory-to-memory copies.  If so, we
+  // do the actual work of removing the live-in registers from the
+  // set.  TODO(stichnot): This work is being repeated for every split
+  // edge to the successor, so consider updating LiveIn just once
+  // after all the edges are split.
+  for (const Inst &I : Assignments) {
+    Variable *Dest = I.getDest();
+    if (Dest->hasReg()) {
+      Available[Dest->getRegNum()] = false;
+    } else if (isMemoryOperand(I.getSrc(0))) {
+      NeedsRegs = true; // Src and Dest are both in memory
+    }
+  }
+  if (NeedsRegs) {
+    LivenessBV &LiveIn = Func->getLiveness()->getLiveIn(Succ);
+    for (int i = LiveIn.find_first(); i != -1; i = LiveIn.find_next(i)) {
+      Variable *Var = Func->getLiveness()->getVariable(i, Succ);
+      if (Var->hasReg())
+        Available[Var->getRegNum()] = false;
+    }
+  }
+  // Iterate backwards through the Assignments.  After lowering each
+  // assignment, add Dest to the set of available registers, and
+  // remove Src from the set of available registers.  Iteration is
+  // done backwards to enable incremental updates of the available
+  // register set, and the lowered instruction numbers may be out of
+  // order, but that can be worked around by renumbering the block
+  // afterwards if necessary.
+  for (const Inst &I : reverse_range(Assignments)) {
+    Context.rewind();
+    auto *Assign = llvm::cast<InstAssign>(&I);
+    Variable *Dest = Assign->getDest();
+
+    // If the source operand is ConstantUndef, do not legalize it.  In
+    // the test_split_undef_int_vec test function, the advanced phi
+    // lowering process finds an assignment of an undefined vector.
+    // That vector, as the Src here, would crash if it went through
+    // legalize(): legalize() would create a new variable with
+    // makeVectorOfZeros(), but the new variable would be assigned a
+    // stack slot, which fails the assertion in IceInstX8632.cpp:789
+    // when XmmEmitterRegOp() complains that Var->hasReg() is false.
+    // Note that this failure is unrelated to randomization or pooling
+    // of constants.  So, we do not call legalize() to add pool labels
+    // for the src operands of phi assignment instructions; instead,
+    // we manually add pool labels for constant float and constant
+    // double values here.  Note that going through legalize() does not
+    // affect the test results of SPEC2K and xtests.
+    Operand *Src = Assign->getSrc(0);
+    if (!llvm::isa<ConstantUndef>(Assign->getSrc(0))) {
+      Src = legalize(Src);
+    }
+
+    Variable *SrcVar = llvm::dyn_cast<Variable>(Src);
+    // Use normal assignment lowering, except lower mem=mem specially
+    // so we can register-allocate at the same time.
+    if (!isMemoryOperand(Dest) || !isMemoryOperand(Src)) {
+      lowerAssign(Assign);
+    } else {
+      assert(Dest->getType() == Src->getType());
+      const llvm::SmallBitVector &RegsForType =
+          getRegisterSetForType(Dest->getType());
+      llvm::SmallBitVector AvailRegsForType = RegsForType & Available;
+      Variable *SpillLoc = nullptr;
+      Variable *Preg = nullptr;
+      // TODO(stichnot): Opportunity for register randomization.
+      int32_t RegNum = AvailRegsForType.find_first();
+      bool IsVector = isVectorType(Dest->getType());
+      bool NeedSpill = (RegNum == -1);
+      if (NeedSpill) {
+        // Pick some register to spill and update RegNum.
+        // TODO(stichnot): Opportunity for register randomization.
+        RegNum = RegsForType.find_first();
+        Preg = getPhysicalRegister(RegNum, Dest->getType());
+        SpillLoc = Func->makeVariable(Dest->getType());
+        // Create a fake def of the physical register to avoid
+        // liveness inconsistency problems during late-stage liveness
+        // analysis (e.g. asm-verbose mode).
+        Context.insert(InstFakeDef::create(Func, Preg));
+        if (IsVector)
+          _movp(SpillLoc, Preg);
+        else
+          _mov(SpillLoc, Preg);
+      }
+      assert(RegNum >= 0);
+      if (llvm::isa<ConstantUndef>(Src))
+        // Materialize an actual constant instead of undef.  RegNum is
+        // passed in for vector types because undef vectors are
+        // lowered to a vector register of zeroes.
+        Src =
+            legalize(Src, Legal_All, IsVector ? RegNum : Variable::NoRegister);
+      Variable *Tmp = makeReg(Dest->getType(), RegNum);
+      if (IsVector) {
+        _movp(Tmp, Src);
+        _movp(Dest, Tmp);
+      } else {
+        _mov(Tmp, Src);
+        _mov(Dest, Tmp);
+      }
+      if (NeedSpill) {
+        // Restore the spilled register.
+        if (IsVector)
+          _movp(Preg, SpillLoc);
+        else
+          _mov(Preg, SpillLoc);
+        // Create a fake use of the physical register to keep it live
+        // for late-stage liveness analysis (e.g. asm-verbose mode).
+        Context.insert(InstFakeUse::create(Func, Preg));
+      }
+    }
+    // Update register availability before moving to the previous
+    // instruction on the Assignments list.
+    if (Dest->hasReg())
+      Available[Dest->getRegNum()] = true;
+    if (SrcVar && SrcVar->hasReg())
+      Available[SrcVar->getRegNum()] = false;
+  }
+
+  // Add the terminator branch instruction to the end.
+  Context.setInsertPoint(Context.getEnd());
+  _br(Succ);
+}
+
+// There is no support for loading or emitting vector constants, so the
+// vector values returned from makeVectorOfZeros, makeVectorOfOnes,
+// etc. are initialized with register operations.
+//
+// TODO(wala): Add limited support for vector constants so that
+// complex initialization in registers is unnecessary.
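+//
+// For example (illustrative), makeVectorOfOnes(v4i32) combines the
+// helpers below entirely in registers:
+//   pxor    Reg, Reg   ; Reg = <0, 0, 0, 0>
+//   pcmpeqd Tmp, Tmp   ; Tmp = <-1, -1, -1, -1>
+//   psubd   Tmp, Reg   ; Reg = <1, 1, 1, 1>, since 0 - (-1) = 1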
+
+template <class Machine>
+Variable *TargetX86Base<Machine>::makeVectorOfZeros(Type Ty, int32_t RegNum) {
+  Variable *Reg = makeReg(Ty, RegNum);
+  // Insert a FakeDef, since otherwise the live range of Reg might
+  // be overestimated.
+  Context.insert(InstFakeDef::create(Func, Reg));
+  _pxor(Reg, Reg);
+  return Reg;
+}
+
+template <class Machine>
+Variable *TargetX86Base<Machine>::makeVectorOfMinusOnes(Type Ty,
+                                                        int32_t RegNum) {
+  Variable *MinusOnes = makeReg(Ty, RegNum);
+  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
+  Context.insert(InstFakeDef::create(Func, MinusOnes));
+  _pcmpeq(MinusOnes, MinusOnes);
+  return MinusOnes;
+}
+
+template <class Machine>
+Variable *TargetX86Base<Machine>::makeVectorOfOnes(Type Ty, int32_t RegNum) {
+  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
+  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
+  _psub(Dest, MinusOne);
+  return Dest;
+}
+
+template <class Machine>
+Variable *TargetX86Base<Machine>::makeVectorOfHighOrderBits(Type Ty,
+                                                            int32_t RegNum) {
+  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
+         Ty == IceType_v16i8);
+  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
+    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
+    SizeT Shift =
+        typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
+    _psll(Reg, Ctx->getConstantInt8(Shift));
+    return Reg;
+  } else {
+    // SSE has no left shift operation for vectors of 8 bit integers.
+    const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
+    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
+    Variable *Reg = makeReg(Ty, RegNum);
+    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
+    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
+    return Reg;
+  }
+}
+
+// Construct a mask in a register that can be and'ed with a
+// floating-point value to mask off its sign bit.  The value will be
+// <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff>
+// for f64.  Construct it as a vector of minus ones (all bits set)
+// logically right shifted one bit.  TODO(stichnot): Fix the wala TODO
+// above, to represent vector constants in memory.
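+//
+// For example (illustrative), for v4f32:
+//   pcmpeqd Reg, Reg   ; Reg = <0xffffffff x 4>
+//   psrld   $1, Reg    ; Reg = <0x7fffffff x 4>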
+template <class Machine>
+Variable *TargetX86Base<Machine>::makeVectorOfFabsMask(Type Ty,
+                                                       int32_t RegNum) {
+  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
+  _psrl(Reg, Ctx->getConstantInt8(1));
+  return Reg;
+}
+
+template <class Machine>
+OperandX8632Mem *
+TargetX86Base<Machine>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                                     uint32_t Offset) {
+  // Ensure that Slot is a stack slot.
+  assert(Slot->getWeight().isZero());
+  assert(Slot->getRegNum() == Variable::NoRegister);
+  // Compute the location of Slot in memory.
+  // TODO(wala,stichnot): lea should not be required.  The address of
+  // the stack slot is known at compile time (although not until after
+  // addProlog()).
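+  // E.g. (illustrative): emit "lea 16(%esp), Loc" once, then access the
+  // slot with memory operands of the form "Offset(Loc)".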
+  const Type PointerType = IceType_i32;
+  Variable *Loc = makeReg(PointerType);
+  _lea(Loc, Slot);
+  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
+  return OperandX8632Mem::create(Func, Ty, Loc, ConstantOffset);
+}
+
+// Helper for legalize() to emit the right code to lower an operand to a
+// register of the appropriate type.
+template <class Machine>
+Variable *TargetX86Base<Machine>::copyToReg(Operand *Src, int32_t RegNum) {
+  Type Ty = Src->getType();
+  Variable *Reg = makeReg(Ty, RegNum);
+  if (isVectorType(Ty)) {
+    _movp(Reg, Src);
+  } else {
+    _mov(Reg, Src);
+  }
+  return Reg;
+}
+
+template <class Machine>
+Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed,
+                                          int32_t RegNum) {
+  Type Ty = From->getType();
+  // Assert that a physical register is allowed.  To date, all calls
+  // to legalize() allow a physical register.  If a physical register
+  // needs to be explicitly disallowed, then new code will need to be
+  // written to force a spill.
+  assert(Allowed & Legal_Reg);
+  // If we're asking for a specific physical register, make sure we're
+  // not allowing any other operand kinds.  (This could be future
+  // work, e.g. allow the shl shift amount to be either an immediate
+  // or in ecx.)
+  assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg);
+
+  if (auto Mem = llvm::dyn_cast<OperandX8632Mem>(From)) {
+    // Before doing anything with a Mem operand, we need to ensure
+    // that the Base and Index components are in physical registers.
+    Variable *Base = Mem->getBase();
+    Variable *Index = Mem->getIndex();
+    Variable *RegBase = nullptr;
+    Variable *RegIndex = nullptr;
+    if (Base) {
+      RegBase = legalizeToVar(Base);
+    }
+    if (Index) {
+      RegIndex = legalizeToVar(Index);
+    }
+    if (Base != RegBase || Index != RegIndex) {
+      Mem =
+          OperandX8632Mem::create(Func, Ty, RegBase, Mem->getOffset(), RegIndex,
+                                  Mem->getShift(), Mem->getSegmentRegister());
+    }
+
+    // For all memory operands, we do the randomization/pooling here.
+    From = randomizeOrPoolImmediate(Mem);
+
+    if (!(Allowed & Legal_Mem)) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
+    if (llvm::isa<ConstantUndef>(Const)) {
+      // Lower undefs to zero.  Another option is to lower undefs to an
+      // uninitialized register; however, using an uninitialized register
+      // results in less predictable code.
+      //
+      // If in the future the implementation is changed to lower undef
+      // values to uninitialized registers, a FakeDef will be needed:
+      //     Context.insert(InstFakeDef::create(Func, Reg));
+      // This is in order to ensure that the live range of Reg is not
+      // overestimated.  If the constant being lowered is a 64 bit value,
+      // then the result should be split and the lo and hi components will
+      // need to go in uninitialized registers.
+      if (isVectorType(Ty))
+        return makeVectorOfZeros(Ty, RegNum);
+      Const = Ctx->getConstantZero(Ty);
+      From = Const;
+    }
+    // There should be no constants of vector type (other than undef).
+    assert(!isVectorType(Ty));
+
+    // If the operand is a 32-bit constant integer, check whether it needs
+    // to be randomized or pooled.
+    if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
+      Operand *NewConst = randomizeOrPoolImmediate(C, RegNum);
+      if (NewConst != Const) {
+        return NewConst;
+      }
+    }
+
+    // Convert a scalar floating point constant into an explicit
+    // memory operand.
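+    // E.g. (illustrative): the constant becomes a memory operand that
+    // references its pool label, so a later load looks roughly like
+    // "movss label, Reg", with the label text produced by emitPoolLabel().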
+    if (isScalarFloatingType(Ty)) {
+      Variable *Base = nullptr;
+      std::string Buffer;
+      llvm::raw_string_ostream StrBuf(Buffer);
+      llvm::cast<Constant>(From)->emitPoolLabel(StrBuf);
+      llvm::cast<Constant>(From)->setShouldBePooled(true);
+      Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true);
+      From = OperandX8632Mem::create(Func, Ty, Base, Offset);
+    }
+    bool NeedsReg = false;
+    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
+      // Immediate specifically not allowed
+      NeedsReg = true;
+    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
+      // On x86, FP constants are lowered to mem operands.
+      NeedsReg = true;
+    if (NeedsReg) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+  if (auto Var = llvm::dyn_cast<Variable>(From)) {
+    // Check if the variable is guaranteed a physical register.  This
+    // can happen either when the variable is pre-colored or when it is
+    // assigned infinite weight.
+    bool MustHaveRegister = (Var->hasReg() || Var->getWeight().isInf());
+    // We need a new physical register for the operand if:
+    //   Mem is not allowed and Var isn't guaranteed a physical
+    //   register, or
+    //   RegNum is required and Var->getRegNum() doesn't match.
+    if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
+        (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+  llvm_unreachable("Unhandled operand kind in legalize()");
+  return From;
+}
+
+// Provide a trivial wrapper to legalize() for this common usage.
+template <class Machine>
+Variable *TargetX86Base<Machine>::legalizeToVar(Operand *From, int32_t RegNum) {
+  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
+}
+
+// For the cmp instruction, if Src1 is an immediate, or known to be a
+// physical register, we can allow Src0 to be a memory operand.
+// Otherwise, Src0 must be copied into a physical register.
+// (Actually, either Src0 or Src1 can be chosen for the physical
+// register, but unfortunately we have to commit to one or the other
+// before register allocation.)
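+//
+// For example, "cmp reg, mem" and "cmp imm, mem" are encodable on x86,
+// but "cmp mem, mem" is not, so in that case Src0 must first be copied
+// into a register.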
+template <class Machine>
+Operand *TargetX86Base<Machine>::legalizeSrc0ForCmp(Operand *Src0,
+                                                    Operand *Src1) {
+  bool IsSrc1ImmOrReg = false;
+  if (llvm::isa<Constant>(Src1)) {
+    IsSrc1ImmOrReg = true;
+  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
+    if (Var->hasReg())
+      IsSrc1ImmOrReg = true;
+  }
+  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
+}
+
+template <class Machine>
+OperandX8632Mem *TargetX86Base<Machine>::formMemoryOperand(Operand *Opnd,
+                                                           Type Ty,
+                                                           bool DoLegalize) {
+  OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Opnd);
+  // Address mode optimization may have already created an
+  // OperandX8632Mem, in which case no further transformation is needed.
+  if (!Mem) {
+    Variable *Base = llvm::dyn_cast<Variable>(Opnd);
+    Constant *Offset = llvm::dyn_cast<Constant>(Opnd);
+    assert(Base || Offset);
+    if (Offset) {
+      // During memory operand building, we do not blind or pool the
+      // constant offset; we will work on the whole memory operand as one
+      // entity later, which saves one instruction.  By turning blinding
+      // and pooling off, we guarantee that legalize(Offset) will return
+      // a Constant*.
+      {
+        BoolFlagSaver B(RandomizationPoolingPaused, true);
+        Offset = llvm::cast<Constant>(legalize(Offset));
+      }
+
+      assert(llvm::isa<ConstantInteger32>(Offset) ||
+             llvm::isa<ConstantRelocatable>(Offset));
+    }
+    Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
+  }
+  // Do legalization, which includes randomization/pooling, or do the
+  // randomization/pooling alone.
+  return llvm::cast<OperandX8632Mem>(
+      DoLegalize ? legalize(Mem) : randomizeOrPoolImmediate(Mem));
+}
+
+template <class Machine>
+Variable *TargetX86Base<Machine>::makeReg(Type Ty, int32_t RegNum) {
+  // There aren't any 64-bit integer registers for x86-32.
+  assert(Ty != IceType_i64);
+  Variable *Reg = Func->makeVariable(Ty);
+  if (RegNum == Variable::NoRegister)
+    Reg->setWeightInfinite();
+  else
+    Reg->setRegNum(RegNum);
+  return Reg;
+}
+
+template <class Machine> void TargetX86Base<Machine>::postLower() {
+  if (Ctx->getFlags().getOptLevel() == Opt_m1)
+    return;
+  inferTwoAddress();
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::makeRandomRegisterPermutation(
+    llvm::SmallVectorImpl<int32_t> &Permutation,
+    const llvm::SmallBitVector &ExcludeRegisters) const {
+  // TODO(stichnot): Declaring Permutation this way loses type/size
+  // information.  Fix this in conjunction with the caller-side TODO.
+  assert(Permutation.size() >= RegX8632::Reg_NUM);
+  // Expected upper bound on the number of registers in a single
+  // equivalence class.  For x86-32, this would comprise the 8 XMM
+  // registers.  This is for performance, not correctness.
+  static const unsigned MaxEquivalenceClassSize = 8;
+  typedef llvm::SmallVector<int32_t, MaxEquivalenceClassSize> RegisterList;
+  typedef std::map<uint32_t, RegisterList> EquivalenceClassMap;
+  EquivalenceClassMap EquivalenceClasses;
+  SizeT NumShuffled = 0, NumPreserved = 0;
+
+// Build up the equivalence classes of registers by looking at the
+// register properties as well as whether the registers should be
+// explicitly excluded from shuffling.
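+// For example (illustrative), on x86-32 the caller-save integer
+// registers (eax, ecx, edx) share one equivalence class while the XMM
+// registers form another, so shuffling never moves a value between
+// incompatible registers.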
+#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
+          frameptr, isI8, isInt, isFP)                                         \
+  if (ExcludeRegisters[RegX8632::val]) {                                       \
+    /* val stays the same in the resulting permutation. */                     \
+    Permutation[RegX8632::val] = RegX8632::val;                                \
+    ++NumPreserved;                                                            \
+  } else {                                                                     \
+    const uint32_t Index = (scratch << 0) | (preserved << 1) | (isI8 << 2) |   \
+                           (isInt << 3) | (isFP << 4);                         \
+    /* val is assigned to an equivalence class based on its properties. */     \
+    EquivalenceClasses[Index].push_back(RegX8632::val);                        \
+  }
+  REGX8632_TABLE
+#undef X
+
+  RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
+
+  // Shuffle the resulting equivalence classes.
+  for (auto I : EquivalenceClasses) {
+    const RegisterList &List = I.second;
+    RegisterList Shuffled(List);
+    RandomShuffle(Shuffled.begin(), Shuffled.end(), RNG);
+    for (size_t SI = 0, SE = Shuffled.size(); SI < SE; ++SI) {
+      Permutation[List[SI]] = Shuffled[SI];
+      ++NumShuffled;
+    }
+  }
+
+  assert(NumShuffled + NumPreserved == RegX8632::Reg_NUM);
+
+  if (Func->isVerbose(IceV_Random)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << "Register equivalence classes:\n";
+    for (auto I : EquivalenceClasses) {
+      Str << "{";
+      const RegisterList &List = I.second;
+      bool First = true;
+      for (int32_t Register : List) {
+        if (!First)
+          Str << " ";
+        First = false;
+        Str << getRegName(Register, IceType_i32);
+      }
+      Str << "}\n";
+    }
+  }
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::emit(const ConstantInteger32 *C) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << getConstantPrefix() << C->getValue();
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::emit(const ConstantInteger64 *) const {
+  llvm::report_fatal_error("Not expecting to emit 64-bit integers");
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::emit(const ConstantFloat *C) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  C->emitPoolLabel(Str);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::emit(const ConstantDouble *C) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  C->emitPoolLabel(Str);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::emit(const ConstantUndef *) const {
+  llvm::report_fatal_error("undef value encountered by emitter.");
+}
+
+// Randomize or pool an Immediate.
+template <class Machine>
+Operand *TargetX86Base<Machine>::randomizeOrPoolImmediate(Constant *Immediate,
+                                                          int32_t RegNum) {
+  assert(llvm::isa<ConstantInteger32>(Immediate) ||
+         llvm::isa<ConstantRelocatable>(Immediate));
+  if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
+      RandomizationPoolingPaused) {
+    // Immediate randomization/pooling is off or paused.
+    return Immediate;
+  }
+  if (Immediate->shouldBeRandomizedOrPooled(Ctx)) {
+    Ctx->statsUpdateRPImms();
+    if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() ==
+        RPI_Randomize) {
+      // Blind the constant.
+      // FROM:
+      //  imm
+      // TO:
+      //  insert: mov imm+cookie, Reg
+      //  insert: lea -cookie[Reg], Reg
+      //  => Reg
+      // If we have already been assigned a physical register (RegNum), we
+      // must have come from advancedPhiLowering()=>lowerAssign().  In that
+      // case we should reuse the assigned register, as this assignment is
+      // the start of its use-def chain.  So we pass the RegNum argument here.
+      // Note that we use the 'lea' instruction instead of 'xor' to avoid
+      // affecting the flags.
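+      // E.g. (illustrative), with Cookie = 0x1234 and imm = 7:
+      //  insert: mov $0x123b, Reg
+      //  insert: lea -0x1234(Reg), Reg   ; Reg ends up holding 7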
+      Variable *Reg = makeReg(IceType_i32, RegNum);
+      ConstantInteger32 *Integer = llvm::cast<ConstantInteger32>(Immediate);
+      uint32_t Value = Integer->getValue();
+      uint32_t Cookie = Ctx->getRandomizationCookie();
+      _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value));
+      Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie);
+      _lea(Reg,
+           OperandX8632Mem::create(Func, IceType_i32, Reg, Offset, nullptr, 0));
+      // Make sure liveness analysis won't kill this variable; otherwise a
+      // liveness assertion will be triggered.
+      _set_dest_nonkillable();
+      if (Immediate->getType() != IceType_i32) {
+        Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
+        _mov(TruncReg, Reg);
+        return TruncReg;
+      }
+      return Reg;
+    }
+    if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) {
+      // Pool the constant.
+      // FROM:
+      //  imm
+      // TO:
+      //  insert: mov $label, Reg
+      //  => Reg
+      assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool);
+      Immediate->setShouldBePooled(true);
+      // If we have already been assigned a physical register (RegNum), we
+      // must have come from advancedPhiLowering()=>lowerAssign().  In that
+      // case we should reuse the assigned register, as this assignment is
+      // the start of its use-def chain.  So we pass the RegNum argument here.
+      Variable *Reg = makeReg(Immediate->getType(), RegNum);
+      IceString Label;
+      llvm::raw_string_ostream Label_stream(Label);
+      Immediate->emitPoolLabel(Label_stream);
+      const RelocOffsetT Offset = 0;
+      const bool SuppressMangling = true;
+      Constant *Symbol =
+          Ctx->getConstantSym(Offset, Label_stream.str(), SuppressMangling);
+      OperandX8632Mem *MemOperand =
+          OperandX8632Mem::create(Func, Immediate->getType(), nullptr, Symbol);
+      _mov(Reg, MemOperand);
+      return Reg;
+    }
+    assert("Unsupported -randomize-pool-immediates option" && false);
+  }
+  // The constant Immediate is not eligible for blinding/pooling.
+  return Immediate;
+}
+
+template <class Machine>
+OperandX8632Mem *
+TargetX86Base<Machine>::randomizeOrPoolImmediate(OperandX8632Mem *MemOperand,
+                                                 int32_t RegNum) {
+  assert(MemOperand);
+  if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
+      RandomizationPoolingPaused) {
+    // Immediate randomization/pooling is turned off or paused.
+    return MemOperand;
+  }
+
+  // If this memory operand has already been randomized, do not
+  // randomize it again.
+  if (MemOperand->getRandomized())
+    return MemOperand;
+
+  if (Constant *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset())) {
+    if (C->shouldBeRandomizedOrPooled(Ctx)) {
+      // The offset of this mem operand should be blinded or pooled
+      Ctx->statsUpdateRPImms();
+      if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() ==
+          RPI_Randomize) {
+        // Blind the constant offset.
+        // FROM:
+        //  offset[base, index, shift]
+        // TO:
+        //  insert: lea offset+cookie[base], RegTemp
+        //  => -cookie[RegTemp, index, shift]
+        uint32_t Value =
+            llvm::cast<ConstantInteger32>(MemOperand->getOffset())->getValue();
+        uint32_t Cookie = Ctx->getRandomizationCookie();
+        Constant *Mask1 = Ctx->getConstantInt(
+            MemOperand->getOffset()->getType(), Cookie + Value);
+        Constant *Mask2 =
+            Ctx->getConstantInt(MemOperand->getOffset()->getType(), 0 - Cookie);
+
+        OperandX8632Mem *TempMemOperand = OperandX8632Mem::create(
+            Func, MemOperand->getType(), MemOperand->getBase(), Mask1);
+        // If we have already assigned a physical register, we must come from
+        // advancedPhiLowering()=>lowerAssign(). In this case we should reuse
+        // the assigned register, as this assignment is the start of its
+        // chain. So we add RegNum argument here.
+        Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
+        _lea(RegTemp, TempMemOperand);
+        // Since the source operand doesn't use the dest register, we don't
+        // need _set_dest_nonkillable().  But if we reuse the same dest
+        // register, i.e. RegNum is assigned, we should add it.
+        if (RegNum != Variable::NoRegister)
+          _set_dest_nonkillable();
+
+        OperandX8632Mem *NewMemOperand = OperandX8632Mem::create(
+            Func, MemOperand->getType(), RegTemp, Mask2, MemOperand->getIndex(),
+            MemOperand->getShift(), MemOperand->getSegmentRegister());
+
+        // Label this memory operand as randomized, so we won't randomize
+        // it again in case legalize() is called multiple times on this
+        // memory operand.
+        NewMemOperand->setRandomized(true);
+        return NewMemOperand;
+      }
+      if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) {
+        // Pool the constant offset.
+        // FROM:
+        //  offset[base, index, shift]
+        // TO:
+        //  insert: mov $label, RegTemp
+        //  insert: lea [base, RegTemp], RegTemp
+        //  => [RegTemp, index, shift]
+        assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() ==
+               RPI_Pool);
+        // Memory operands should never appear as source operands in phi
+        // lowering assignments, so there is no need to reuse any registers
+        // here.  For phi lowering, we should not ask for new physical
+        // registers in general.  However, if we do encounter a memory
+        // operand during phi lowering, we do not blind or pool its
+        // immediates for now.
+        if (RegNum != Variable::NoRegister)
+          return MemOperand;
+        Variable *RegTemp = makeReg(IceType_i32);
+        IceString Label;
+        llvm::raw_string_ostream Label_stream(Label);
+        MemOperand->getOffset()->emitPoolLabel(Label_stream);
+        MemOperand->getOffset()->setShouldBePooled(true);
+        const RelocOffsetT SymOffset = 0;
+        bool SuppressMangling = true;
+        Constant *Symbol = Ctx->getConstantSym(SymOffset, Label_stream.str(),
+                                               SuppressMangling);
+        OperandX8632Mem *SymbolOperand = OperandX8632Mem::create(
+            Func, MemOperand->getOffset()->getType(), nullptr, Symbol);
+        _mov(RegTemp, SymbolOperand);
+        // If we have a base variable here, we should add the lea instruction
+        // to add the value of the base variable to RegTemp. If there is no
+        // base variable, we won't need this lea instruction.
+        if (MemOperand->getBase()) {
+          OperandX8632Mem *CalculateOperand = OperandX8632Mem::create(
+              Func, MemOperand->getType(), MemOperand->getBase(), nullptr,
+              RegTemp, 0, MemOperand->getSegmentRegister());
+          _lea(RegTemp, CalculateOperand);
+          _set_dest_nonkillable();
+        }
+        OperandX8632Mem *NewMemOperand = OperandX8632Mem::create(
+            Func, MemOperand->getType(), RegTemp, nullptr,
+            MemOperand->getIndex(), MemOperand->getShift(),
+            MemOperand->getSegmentRegister());
+        return NewMemOperand;
+      }
+      assert("Unsupported -randomize-pool-immediates option" && false);
+    }
+  }
+  // The offset is not eligible for blinding or pooling; return the
+  // original memory operand.
+  return MemOperand;
+}
+
+} // end of namespace X86Internal
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H