Subzero: lower the rest of the atomic operations.

64-bit ops are expanded via a cmpxchg8b loop.

64/32-bit and/or/xor are also expanded into a cmpxchg /
cmpxchg8b loop.

Add a cross test for atomic RMW operations and
compare and swap.

Misc: Test that atomic.is.lock.free can be optimized out if result is ignored.

TODO:
* optimize compare and swap with compare+branch further down
instruction stream.

* optimize atomic RMW when the return value is ignored
(adds a locked field to binary ops though).

* We may want to do some actual target-dependent basic
block splitting + expansion (the instructions inserted by
the expansion must reference the pre-colored registers,
etc.). Otherwise, we are currently getting by with modeling
the extended liveness of the variables used in the loops
using fake uses.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=jfb@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/362463002
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index cd5095f..c0e8c8d 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -51,9 +51,8 @@
     llvm::array_lengthof(TypeX8632Attributes);
 
 const char *InstX8632SegmentRegNames[] = {
-#define X(val, name)                                                           \
-  name,
-    SEG_REGX8632_TABLE
+#define X(val, name) name,
+  SEG_REGX8632_TABLE
 #undef X
 };
 const size_t InstX8632SegmentRegNamesSize =
@@ -140,6 +139,33 @@
   addSource(Source);
 }
 
+InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
+                                   Variable *Eax, Variable *Desired,
+                                   bool Locked)
+    : InstX8632Lockable(Func, InstX8632::Cmpxchg, 3,
+                        llvm::dyn_cast<Variable>(DestOrAddr), Locked) {
+  assert(Eax->getRegNum() == TargetX8632::Reg_eax);
+  addSource(DestOrAddr);
+  addSource(Eax);
+  addSource(Desired);
+}
+
+InstX8632Cmpxchg8b::InstX8632Cmpxchg8b(Cfg *Func, OperandX8632 *Addr,
+                                       Variable *Edx, Variable *Eax,
+                                       Variable *Ecx, Variable *Ebx,
+                                       bool Locked)
+    : InstX8632Lockable(Func, InstX8632::Cmpxchg8b, 5, NULL, Locked) {
+  assert(Edx->getRegNum() == TargetX8632::Reg_edx); // expected, high half
+  assert(Eax->getRegNum() == TargetX8632::Reg_eax); // expected, low half
+  assert(Ecx->getRegNum() == TargetX8632::Reg_ecx); // desired, high half
+  assert(Ebx->getRegNum() == TargetX8632::Reg_ebx); // desired, low half
+  addSource(Addr);
+  addSource(Edx);
+  addSource(Eax);
+  addSource(Ecx);
+  addSource(Ebx);
+}
+
 InstX8632Cvt::InstX8632Cvt(Cfg *Func, Variable *Dest, Operand *Source)
     : InstX8632(Func, InstX8632::Cvt, 1, Dest) {
   addSource(Source);
@@ -284,9 +310,14 @@
 
 InstX8632Xadd::InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source,
                              bool Locked)
-    : InstX8632(Func, InstX8632::Xadd, 2, llvm::dyn_cast<Variable>(Dest)),
-      Locked(Locked) {
-  HasSideEffects = Locked;
+    : InstX8632Lockable(Func, InstX8632::Xadd, 2,
+                        llvm::dyn_cast<Variable>(Dest), Locked) {
+  addSource(Dest);
+  addSource(Source);
+}
+
+InstX8632Xchg::InstX8632Xchg(Cfg *Func, Operand *Dest, Variable *Source)
+    : InstX8632(Func, InstX8632::Xchg, 2, llvm::dyn_cast<Variable>(Dest)) {
   addSource(Dest);
   addSource(Source);
 }
@@ -398,6 +429,7 @@
   Str << "\n";
 }
 
+template <> const char *InstX8632Neg::Opcode = "neg";
 template <> const char *InstX8632Add::Opcode = "add";
 template <> const char *InstX8632Addps::Opcode = "addps";
 template <> const char *InstX8632Adc::Opcode = "adc";
@@ -554,6 +586,48 @@
   dumpSources(Func);
 }
 
+void InstX8632Cmpxchg::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  if (Locked) {
+    Str << "\tlock";
+  }
+  Str << "\tcmpxchg\t";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(2)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Cmpxchg::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg." << getSrc(0)->getType() << " ";
+  dumpSources(Func);
+}
+
+void InstX8632Cmpxchg8b::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 5);
+  if (Locked) {
+    Str << "\tlock";
+  }
+  Str << "\tcmpxchg8b\t";
+  getSrc(0)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Cmpxchg8b::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg8b ";
+  dumpSources(Func);
+}
+
 void InstX8632Cvt::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
@@ -955,10 +1029,9 @@
 void InstX8632Xadd::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   if (Locked) {
-    Str << "\tlock xadd ";
-  } else {
-    Str << "\txadd\t";
+    Str << "\tlock";
   }
+  Str << "\txadd\t";
   getSrc(0)->emit(Func);
   Str << ", ";
   getSrc(1)->emit(Func);
@@ -975,6 +1048,22 @@
   dumpSources(Func);
 }
 
+void InstX8632Xchg::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\txchg\t";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Xchg::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = getSrc(0)->getType();
+  Str << "xchg." << Ty << " ";
+  dumpSources(Func);
+}
+
 void OperandX8632::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "<OperandX8632>";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index baf072a..25beb6d 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -54,9 +54,8 @@
 public:
   enum SegmentRegisters {
     DefaultSegment = -1,
-#define X(val, name)                                                           \
-    val,
-      SEG_REGX8632_TABLE
+#define X(val, name) val,
+    SEG_REGX8632_TABLE
 #undef X
         SegReg_NUM
   };
@@ -142,6 +141,8 @@
     Br,
     Call,
     Cdq,
+    Cmpxchg,
+    Cmpxchg8b,
     Cvt,
     Div,
     Divps,
@@ -162,6 +163,7 @@
     Mul,
     Mulps,
     Mulss,
+    Neg,
     Or,
     Pop,
     Push,
@@ -183,6 +185,7 @@
     Ucomiss,
     UD2,
     Xadd,
+    Xchg,
     Xor
   };
   static const char *getWidthString(Type Ty);
@@ -328,6 +331,41 @@
   virtual ~InstX8632Call() {}
 };
 
+template <InstX8632::InstKindX8632 K>
+class InstX8632Unaryop : public InstX8632 {
+public:
+  // Create a unary-op instruction like neg; SrcDest is both source and dest.
+  // Dest is NULL unless SrcDest is a Variable, so dump() types via getSrc(0).
+  static InstX8632Unaryop *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX8632Unaryop>())
+        InstX8632Unaryop(Func, SrcDest);
+  }
+  virtual void emit(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    getSrc(0)->emit(Func);
+    Str << "\n";
+  }
+  virtual void dump(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getSrc(0)->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstX8632Unaryop(Cfg *Func, Operand *SrcDest)
+      : InstX8632(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
+    addSource(SrcDest);
+  }
+  InstX8632Unaryop(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
+  InstX8632Unaryop &operator=(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Unaryop() {}
+  static const char *Opcode;
+};
+
 // See the definition of emitTwoAddress() for a description of
 // ShiftHack.
 void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
@@ -400,6 +438,7 @@
   static const char *Opcode;
 };
 
+typedef InstX8632Unaryop<InstX8632::Neg> InstX8632Neg;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
 typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
 typedef InstX8632Binop<InstX8632::Adc> InstX8632Adc;
@@ -423,6 +462,28 @@
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
 
+// Base class for a lockable x86-32 instruction (emits a lock prefix).
+class InstX8632Lockable : public InstX8632 {
+public:
+  virtual void emit(const Cfg *Func) const = 0;
+  virtual void dump(const Cfg *Func) const;
+
+protected:
+  bool Locked;
+
+  InstX8632Lockable(Cfg *Func, InstKindX8632 Kind, SizeT Maxsrcs,
+                    Variable *Dest, bool Locked)
+      : InstX8632(Func, Kind, Maxsrcs, Dest), Locked(Locked) {
+    // Assume that such instructions are used for Atomics and be careful
+    // with optimizations.
+    HasSideEffects = Locked;
+  }
+
+private:
+  InstX8632Lockable(const InstX8632Lockable &) LLVM_DELETED_FUNCTION;
+  InstX8632Lockable &operator=(const InstX8632Lockable &) LLVM_DELETED_FUNCTION;
+};
+
 // Mul instruction - unsigned multiply.
 class InstX8632Mul : public InstX8632 {
 public:
@@ -502,6 +563,57 @@
   virtual ~InstX8632Cdq() {}
 };
 
+// Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
+// equals eax. If so, the ZF is set and <desired> is stored in <dest>.
+// If not, ZF is cleared and <dest> is copied to eax (or subregister).
+// <dest> can be a register or memory, while <desired> must be a register.
+// It is the user's responsibility to mark eax with a FakeDef.
+class InstX8632Cmpxchg : public InstX8632Lockable {
+public:
+  static InstX8632Cmpxchg *create(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                                  Variable *Desired, bool Locked) {
+    return new (Func->allocate<InstX8632Cmpxchg>())
+        InstX8632Cmpxchg(Func, DestOrAddr, Eax, Desired, Locked);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpxchg); }
+
+private:
+  InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                   Variable *Desired, bool Locked);
+  InstX8632Cmpxchg(const InstX8632Cmpxchg &) LLVM_DELETED_FUNCTION;
+  InstX8632Cmpxchg &operator=(const InstX8632Cmpxchg &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cmpxchg() {}
+};
+
+// Cmpxchg8b instruction - cmpxchg8b <m64> compares <m64> with edx:eax.
+// If they are equal, ZF is set and ecx:ebx is stored in <m64>.
+// If not, ZF is cleared and <m64> is copied to edx:eax.
+// The caller is responsible for inserting FakeDefs to mark edx
+// and eax as modified.
+// <m64> (the Addr operand) must be a memory operand.
+class InstX8632Cmpxchg8b : public InstX8632Lockable {
+public:
+  static InstX8632Cmpxchg8b *create(Cfg *Func, OperandX8632 *Addr,
+                                    Variable *Edx, Variable *Eax, Variable *Ecx,
+                                    Variable *Ebx, bool Locked) {
+    return new (Func->allocate<InstX8632Cmpxchg8b>())
+        InstX8632Cmpxchg8b(Func, Addr, Edx, Eax, Ecx, Ebx, Locked);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpxchg8b); }
+
+private:
+  InstX8632Cmpxchg8b(Cfg *Func, OperandX8632 *Addr, Variable *Edx,
+                     Variable *Eax, Variable *Ecx, Variable *Ebx, bool Locked);
+  InstX8632Cmpxchg8b(const InstX8632Cmpxchg8b &) LLVM_DELETED_FUNCTION;
+  InstX8632Cmpxchg8b &
+  operator=(const InstX8632Cmpxchg8b &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cmpxchg8b() {}
+};
+
 // Cvt instruction - wrapper for cvtsX2sY where X and Y are in {s,d,i}
 // as appropriate.  s=float, d=double, i=int.  X and Y are determined
 // from dest/src types.  Sign and zero extension on the integer
@@ -861,7 +973,7 @@
 //
 // Both the dest and source are updated. The caller should then insert a
 // FakeDef to reflect the second udpate.
-class InstX8632Xadd : public InstX8632 {
+class InstX8632Xadd : public InstX8632Lockable {
 public:
   static InstX8632Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
                                bool Locked) {
@@ -873,14 +985,35 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, Xadd); }
 
 private:
-  bool Locked;
-
   InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
   InstX8632Xadd(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
   InstX8632Xadd &operator=(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
   virtual ~InstX8632Xadd() {}
 };
 
+// Exchange instruction.  Exchanges the first operand (destination
+// operand) with the second operand (source operand). At least one of
+// the operands must be a register (and the other can be reg or mem).
+// Both the Dest and Source are updated. If there is a memory operand,
+// then the instruction is automatically "locked" without the need for
+// a lock prefix.
+class InstX8632Xchg : public InstX8632 {
+public:
+  static InstX8632Xchg *create(Cfg *Func, Operand *Dest, Variable *Source) {
+    return new (Func->allocate<InstX8632Xchg>())
+        InstX8632Xchg(Func, Dest, Source);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Xchg); }
+
+private:
+  InstX8632Xchg(Cfg *Func, Operand *Dest, Variable *Source);
+  InstX8632Xchg(const InstX8632Xchg &) LLVM_DELETED_FUNCTION;
+  InstX8632Xchg &operator=(const InstX8632Xchg &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Xchg() {}
+};
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEINSTX8632_H
diff --git a/src/IceIntrinsics.cpp b/src/IceIntrinsics.cpp
index 02562b5..b83513f 100644
--- a/src/IceIntrinsics.cpp
+++ b/src/IceIntrinsics.cpp
@@ -46,7 +46,7 @@
       "nacl.atomic.fence" },
     { { { Intrinsics::AtomicFenceAll, true }, { IceType_void }, 1 },
       "nacl.atomic.fence.all" },
-    { { { Intrinsics::AtomicIsLockFree, true },
+    { { { Intrinsics::AtomicIsLockFree, false },
         { IceType_i1, IceType_i32, IceType_i32 }, 3 },
       "nacl.atomic.is.lock.free" },
 
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index f1b8c25..bf11573 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1968,7 +1968,7 @@
 
 void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
   switch (Instr->getIntrinsicInfo().ID) {
-  case Intrinsics::AtomicCmpxchg:
+  case Intrinsics::AtomicCmpxchg: {
     if (!Intrinsics::VerifyMemoryOrder(
              llvm::cast<ConstantInteger>(Instr->getArg(3))->getValue())) {
       Func->setError("Unexpected memory ordering (success) for AtomicCmpxchg");
@@ -1979,9 +1979,18 @@
       Func->setError("Unexpected memory ordering (failure) for AtomicCmpxchg");
       return;
     }
-    // TODO(jvoung): fill it in.
-    Func->setError("Unhandled intrinsic");
+    Variable *DestPrev = Instr->getDest();
+    Operand *PtrToMem = Instr->getArg(0);
+    Operand *Expected = Instr->getArg(1);
+    Operand *Desired = Instr->getArg(2);
+    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
+    // TODO(jvoung): If we peek ahead a few instructions and see how
+    // DestPrev is used (typically via another compare and branch),
+    // we may be able to optimize. If the result truly is used by a
+    // compare + branch, and the comparison is for equality, then we can
+    // optimize out the later compare, and fuse with the later branch.
     return;
+  }
   case Intrinsics::AtomicFence:
     if (!Intrinsics::VerifyMemoryOrder(
              llvm::cast<ConstantInteger>(Instr->getArg(0))->getValue())) {
@@ -2183,18 +2192,54 @@
   return;
 }
 
+void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
+                                     Operand *Expected, Operand *Desired) {
+  if (Expected->getType() == IceType_i64) {
+    // Reserve the pre-colored registers first, before adding any more
+    // infinite-weight variables from FormMemoryOperand's legalization.
+    Variable *T_edx = makeReg(IceType_i32, Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Reg_eax);
+    Variable *T_ecx = makeReg(IceType_i32, Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Reg_ebx);
+    _mov(T_eax, loOperand(Expected));
+    _mov(T_edx, hiOperand(Expected));
+    _mov(T_ebx, loOperand(Desired));
+    _mov(T_ecx, hiOperand(Desired));
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Expected->getType());
+    const bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    Variable *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  Variable *T_eax = makeReg(Expected->getType(), Reg_eax);
+  _mov(T_eax, Expected);
+  OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Expected->getType());
+  Variable *DesiredReg = legalizeToVar(Desired);
+  const bool Locked = true;
+  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
+  _mov(DestPrev, T_eax);
+}
+
 void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
                                  Operand *Ptr, Operand *Val) {
+  bool NeedsCmpxchg = false;
+  LowerBinOp Op_Lo = NULL;
+  LowerBinOp Op_Hi = NULL;
   switch (Operation) {
   default:
     Func->setError("Unknown AtomicRMW operation");
     return;
   case Intrinsics::AtomicAdd: {
     if (Dest->getType() == IceType_i64) {
-      // Do a nasty cmpxchg8b loop. Factor this into a function.
-      // TODO(jvoung): fill it in.
-      Func->setError("Unhandled AtomicRMW operation");
-      return;
+      // Every path that breaks out of the switch must set this to true;
+      // it exists only to be checked by the assert after the switch.
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8632::_add;
+      Op_Hi = &TargetX8632::_adc;
+      break;
     }
     OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
     const bool Locked = true;
@@ -2206,26 +2251,160 @@
   }
   case Intrinsics::AtomicSub: {
     if (Dest->getType() == IceType_i64) {
-      // Do a nasty cmpxchg8b loop.
-      // TODO(jvoung): fill it in.
-      Func->setError("Unhandled AtomicRMW operation");
-      return;
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8632::_sub;
+      Op_Hi = &TargetX8632::_sbb;
+      break;
     }
-    // Generate a memory operand from Ptr.
-    // neg...
-    // Then do the same as AtomicAdd.
-    // TODO(jvoung): fill it in.
-    Func->setError("Unhandled AtomicRMW operation");
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
+    const bool Locked = true;
+    Variable *T = NULL;
+    _mov(T, Val);
+    _neg(T);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
     return;
   }
   case Intrinsics::AtomicOr:
+    // TODO(jvoung): If Dest is null or dead, then some of these
+    // operations do not need an "exchange", but just a locked op.
+    // That appears to be "worth" it for sub, or, and, and xor.
+    // xadd is probably fine vs lock add for add, and xchg is fine
+    // vs an atomic store.
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_or;
+    Op_Hi = &TargetX8632::_or;
+    break;
   case Intrinsics::AtomicAnd:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_and;
+    Op_Hi = &TargetX8632::_and;
+    break;
   case Intrinsics::AtomicXor:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_xor;
+    Op_Hi = &TargetX8632::_xor;
+    break;
   case Intrinsics::AtomicExchange:
-    // TODO(jvoung): fill it in.
-    Func->setError("Unhandled AtomicRMW operation");
+    if (Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
+      // just need to be moved to the ecx and ebx registers.
+      Op_Lo = NULL;
+      Op_Hi = NULL;
+      break;
+    }
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
+    Variable *T = NULL;
+    _mov(T, Val);
+    _xchg(Addr, T);
+    _mov(Dest, T);
     return;
   }
+  // Otherwise, we need a cmpxchg loop.
+  assert(NeedsCmpxchg);
+  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
+}
+
+void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
+                                           Variable *Dest, Operand *Ptr,
+                                           Operand *Val) {
+  // Expand a more complex RMW operation as a cmpxchg loop:
+  // For 64-bit:
+  //   mov     eax, [ptr]
+  //   mov     edx, [ptr + 4]
+  // .LABEL:
+  //   mov     ebx, eax
+  //   <Op_Lo> ebx, <desired_adj_lo>
+  //   mov     ecx, edx
+  //   <Op_Hi> ecx, <desired_adj_hi>
+  //   lock cmpxchg8b [ptr]
+  //   jne     .LABEL
+  //   mov     <dest_lo>, eax
+  //   mov     <dest_hi>, edx
+  //
+  // For 32-bit:
+  //   mov     eax, [ptr]
+  // .LABEL:
+  //   mov     <reg>, eax
+  //   op      <reg>, [desired_adj]
+  //   lock cmpxchg [ptr], <reg>
+  //   jne     .LABEL
+  //   mov     <dest>, eax
+  //
+  // If Op_{Lo,Hi} are NULL, then just copy the value.
+  Val = legalize(Val);
+  Type Ty = Val->getType();
+  if (Ty == IceType_i64) {
+    Variable *T_edx = makeReg(IceType_i32, Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Reg_eax);
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Ty);
+    _mov(T_eax, loOperand(Addr));
+    _mov(T_edx, hiOperand(Addr));
+    Variable *T_ecx = makeReg(IceType_i32, Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Reg_ebx);
+    InstX8632Label *Label = InstX8632Label::create(Func, this);
+    const bool IsXchg8b = Op_Lo == NULL && Op_Hi == NULL;
+    if (!IsXchg8b) {
+      Context.insert(Label);
+      _mov(T_ebx, T_eax);
+      (this->*Op_Lo)(T_ebx, loOperand(Val));
+      _mov(T_ecx, T_edx);
+      (this->*Op_Hi)(T_ecx, hiOperand(Val));
+    } else {
+      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
+      // It just needs the Val loaded into ebx and ecx.
+      // That can also be done before the loop.
+      _mov(T_ebx, loOperand(Val));
+      _mov(T_ecx, hiOperand(Val));
+      Context.insert(Label);
+    }
+    const bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    _br(InstX8632Br::Br_ne, Label);
+    if (!IsXchg8b) {
+      // If Val is a variable, model the extended live range of Val through
+      // the end of the loop, since it will be re-used by the loop.
+      if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+        Variable *ValLo = llvm::cast<Variable>(loOperand(ValVar));
+        Variable *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
+        Context.insert(InstFakeUse::create(Func, ValLo));
+        Context.insert(InstFakeUse::create(Func, ValHi));
+      }
+    } else {
+      // For xchg, the loop is slightly smaller and ebx/ecx are used.
+      Context.insert(InstFakeUse::create(Func, T_ebx));
+      Context.insert(InstFakeUse::create(Func, T_ecx));
+    }
+    // The address base is also reused in the loop.
+    Context.insert(InstFakeUse::create(Func, Addr->getBase()));
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Ty);
+  Variable *T_eax = makeReg(Ty, Reg_eax);
+  _mov(T_eax, Addr);
+  InstX8632Label *Label = InstX8632Label::create(Func, this);
+  Context.insert(Label);
+  // We want to pick a different register for T than Eax, so don't use
+  // _mov(T == NULL, T_eax).
+  Variable *T = makeReg(Ty);
+  _mov(T, T_eax);
+  (this->*Op_Lo)(T, Val);
+  const bool Locked = true;
+  _cmpxchg(Addr, T_eax, T, Locked);
+  _br(InstX8632Br::Br_ne, Label);
+  // If Val is a variable, model the extended live range of Val through
+  // the end of the loop, since it will be re-used by the loop.
+  if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+    Context.insert(InstFakeUse::create(Func, ValVar));
+  }
+  // The address base is also reused in the loop.
+  Context.insert(InstFakeUse::create(Func, Addr->getBase()));
+  _mov(Dest, T_eax);
 }
 
 namespace {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 001f4e6..4953ffc 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -95,9 +95,15 @@
   virtual void doAddressOptLoad();
   virtual void doAddressOptStore();
 
+  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
+                          Operand *Desired);
   void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                       Operand *Val);
 
+  typedef void (TargetX8632::*LowerBinOp)(Variable *, Operand *);
+  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
+                                Variable *Dest, Operand *Ptr, Operand *Val);
+
   // Operand legalization helpers.  To deal with address mode
   // constraints, the helpers will create a new Operand and emit
   // instructions that guarantee that the Operand kind is one of those
@@ -177,6 +183,22 @@
   void _cmp(Operand *Src0, Operand *Src1) {
     Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
   }
+  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
+                bool Locked) {
+    Context.insert(
+        InstX8632Cmpxchg::create(Func, DestOrAddr, Eax, Desired, Locked));
+    // Mark eax as possibly modified by cmpxchg.
+    Context.insert(
+        InstFakeDef::create(Func, Eax, llvm::dyn_cast<Variable>(DestOrAddr)));
+  }
+  void _cmpxchg8b(OperandX8632 *Addr, Variable *Edx, Variable *Eax,
+                  Variable *Ecx, Variable *Ebx, bool Locked) {
+    Context.insert(
+        InstX8632Cmpxchg8b::create(Func, Addr, Edx, Eax, Ecx, Ebx, Locked));
+    // Mark edx, and eax as possibly modified by cmpxchg8b.
+    Context.insert(InstFakeDef::create(Func, Edx));
+    Context.insert(InstFakeDef::create(Func, Eax));
+  }
   void _cvt(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Cvt::create(Func, Dest, Src0));
   }
@@ -232,6 +254,9 @@
   void _mulss(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Mulss::create(Func, Dest, Src0));
   }
+  void _neg(Variable *SrcDest) {
+    Context.insert(InstX8632Neg::create(Func, SrcDest));
+  }
   void _or(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Or::create(Func, Dest, Src0));
   }
@@ -294,7 +319,14 @@
     Context.insert(InstX8632Xadd::create(Func, Dest, Src, Locked));
     // The xadd exchanges Dest and Src (modifying Src).
     // Model that update with a FakeDef.
-    Context.insert(InstFakeDef::create(Func, Src));
+    Context.insert(
+        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
+  }
+  void _xchg(Operand *Dest, Variable *Src) {
+    Context.insert(InstX8632Xchg::create(Func, Dest, Src));
+    // The xchg modifies Dest and Src -- model that update with a FakeDef.
+    Context.insert(
+        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
   }
   void _xor(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Xor::create(Func, Dest, Src0));