Subzero: lower the rest of the atomic operations.

64-bit ops are expanded via a cmpxchg8b loop.

32-bit and 64-bit and/or/xor are also expanded into a cmpxchg /
cmpxchg8b loop.
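
Roughly, the emitted 32-bit loop has this shape (register choices are
illustrative; the 64-bit variant uses edx:eax / ecx:ebx with lock cmpxchg8b,
as documented in expandAtomicRMWAsCmpxchg):

    mov     eax, [ptr]
  .LABEL:
    mov     <reg>, eax
    <op>    <reg>, <val>
    lock cmpxchg [ptr], <reg>
    jne     .LABEL
    mov     <dest>, eax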

Add a cross test for atomic RMW operations and
compare and swap.

Misc: Test that atomic.is.lock.free can be optimized out if its result is ignored.

TODO:
* optimize compare and swap when its result feeds a compare + branch
further down the instruction stream.

* optimize atomic RMW when the return value is ignored
(though this would add a Locked field to binary ops).

* We may want to do actual target-dependent basic block
splitting + expansion (the instructions inserted by the
expansion must reference the pre-colored registers, etc.).
For now, we get by with fake uses that model the extended
liveness of the variables used in the loops.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=jfb@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/362463002
diff --git a/crosstest/crosstest.py b/crosstest/crosstest.py
index a37b10f..be6c54c 100755
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -57,6 +57,11 @@
                            metavar='PATH',
                            help='Path to LLVM executables like llc ' +
                                 '(defaults to $LLVM_BIN_PATH)')
+    argparser.add_argument('--crosstest-bitcode', required=False,
+                           default=1, type=int,
+                           help='Compile non-subzero crosstest object file ' +
+                           'from the same bitcode as the subzero object. ' +
+                           'If 0, then compile it straight from source.')
     args = argparser.parse_args()
 
     objs = []
@@ -113,7 +118,9 @@
         # failures.  This behavior can be inspected by switching
         # use_llc between True and False.
         use_llc = False
-        if use_llc:
+        if not args.crosstest_bitcode:
+            objs.append(arg)
+        elif use_llc:
             shellcmd([os.path.join(llvm_bin_path, 'llc'),
                       '-filetype=obj',
                       '-o=' + obj_llc,
@@ -125,4 +132,4 @@
     linker = 'clang' if os.path.splitext(args.driver)[1] == '.c' else 'clang++'
     shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32', args.driver] +
              objs +
-             ['-lm', '-o', os.path.join(args.dir, args.output)])
+             ['-lm', '-lpthread', '-o', os.path.join(args.dir, args.output)])
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index cf821e2..bba53d7 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -64,6 +64,17 @@
         --driver=test_icmp_main.cpp \
         --output=test_icmp_O${optlevel}
 
+    # Compile the non-subzero object files straight from source
+    # since the native LLVM backend does not understand how to
+    # lower NaCl-specific intrinsics.
+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+       --dir="${OUTDIR}" \
+       --llvm-bin-path="${LLVM_BIN_PATH}" \
+       --test=test_sync_atomic.cpp \
+       --crosstest-bitcode=0 \
+       --driver=test_sync_atomic_main.cpp \
+       --output=test_sync_atomic_O${optlevel}
+
 done
 
 for optlevel in ${OPTLEVELS} ; do
@@ -74,4 +85,5 @@
     "${OUTDIR}"/test_fcmp_O${optlevel}
     "${OUTDIR}"/test_global_O${optlevel}
     "${OUTDIR}"/test_icmp_O${optlevel}
+    "${OUTDIR}"/test_sync_atomic_O${optlevel}
 done
diff --git a/crosstest/test_sync_atomic.cpp b/crosstest/test_sync_atomic.cpp
new file mode 100644
index 0000000..05d0336
--- /dev/null
+++ b/crosstest/test_sync_atomic.cpp
@@ -0,0 +1,63 @@
+//===- subzero/crosstest/test_sync_atomic.cpp - Implementation for tests --===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This aims to test that all the atomic RMW instructions and compare and swap
+// work across the allowed atomic types. This uses the __sync_* builtins
+// to test the atomic operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#include <cstdlib>
+
+#include "test_sync_atomic.h"
+
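+// For illustration, a single expansion of the X macro below (with inst=add
+// and type=uint32_t) yields roughly:
+//
+//   uint32_t test_add(bool fetch_first, volatile uint32_t *ptr, uint32_t a) {
+//     if (fetch_first)
+//       return __sync_fetch_and_add(ptr, a);
+//     else
+//       return __sync_add_and_fetch(ptr, a);
+//   }
+//
+// plus the analogous test_alloca_add and test_const_add variants.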
+#define X(inst, type)                                                   \
+  type test_##inst(bool fetch_first, volatile type *ptr, type a) {      \
+    if (fetch_first) {                                                  \
+      return __sync_fetch_and_##inst(ptr, a);                           \
+    } else {                                                            \
+      return __sync_##inst##_and_fetch(ptr, a);                         \
+    }                                                                   \
+  }                                                                     \
+  type test_alloca_##inst(bool fetch, volatile type *ptr, type a) {     \
+    const size_t buf_size = 8;                                          \
+    type buf[buf_size];                                                 \
+    for (size_t i = 0; i < buf_size; ++i) {                             \
+      if (fetch) {                                                      \
+        buf[i] = __sync_fetch_and_##inst(ptr, a);                       \
+      } else {                                                          \
+        buf[i] = __sync_##inst##_and_fetch(ptr, a);                     \
+      }                                                                 \
+    }                                                                   \
+    type sum = 0;                                                       \
+    for (size_t i = 0; i < buf_size; ++i) {                             \
+      sum += buf[i];                                                    \
+    }                                                                   \
+    return sum;                                                         \
+  }                                                                     \
+  type test_const_##inst(bool fetch, volatile type *ptr, type ign) {    \
+    if (fetch) {                                                        \
+      return __sync_fetch_and_##inst(ptr, 42);                          \
+    } else {                                                            \
+      return __sync_##inst##_and_fetch(ptr, 99);                        \
+    }                                                                   \
+  }
+
+FOR_ALL_RMWOP_TYPES(X)
+#undef X
+
+#define X(type)                                                          \
+  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval) { \
+    return __sync_val_compare_and_swap(ptr, oldval, newval);             \
+  }
+
+ATOMIC_TYPE_TABLE
+#undef X
diff --git a/crosstest/test_sync_atomic.def b/crosstest/test_sync_atomic.def
new file mode 100644
index 0000000..f84afde
--- /dev/null
+++ b/crosstest/test_sync_atomic.def
@@ -0,0 +1,50 @@
+//===- subzero/crosstest/test_sync_atomic.def - macros for tests -*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for testing atomic intrinsics (via sync builtins).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_SYNC_ATOMIC_DEF
+#define TEST_SYNC_ATOMIC_DEF
+
+#define STR(s) #s
+
+#define RMWOP_TABLE  \
+  /* inst */         \
+  X(add)             \
+  X(sub)             \
+  X(or)              \
+  X(and)             \
+  X(xor)
+//#define X(inst)
+
+#define ATOMIC_TYPE_TABLE \
+  /* type */              \
+  X(uint8_t)              \
+  X(uint16_t)             \
+  X(uint32_t)             \
+  X(uint64_t)
+//#define X(type)
+
+#define FOR_ALL_RMWTYPES_INST(F, inst) \
+  F(inst, uint8_t)                     \
+  F(inst, uint16_t)                    \
+  F(inst, uint32_t)                    \
+  F(inst, uint64_t)
+
+#define FOR_ALL_RMWOP_TYPES(X)      \
+  FOR_ALL_RMWTYPES_INST(X, add)     \
+  FOR_ALL_RMWTYPES_INST(X, sub)     \
+  FOR_ALL_RMWTYPES_INST(X, or)      \
+  FOR_ALL_RMWTYPES_INST(X, and)     \
+  FOR_ALL_RMWTYPES_INST(X, xor)
+//#define X(inst, type)
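+//
+// Typical use (a sketch; test_sync_atomic.h does essentially this to declare
+// the test prototypes):
+//   #define X(inst, type) \
+//     type test_##inst(bool fetch_first, volatile type *ptr, type a);
+//   FOR_ALL_RMWOP_TYPES(X)
+//   #undef X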
+
+#endif // TEST_SYNC_ATOMIC_DEF
diff --git a/crosstest/test_sync_atomic.h b/crosstest/test_sync_atomic.h
new file mode 100644
index 0000000..a88cd73
--- /dev/null
+++ b/crosstest/test_sync_atomic.h
@@ -0,0 +1,29 @@
+//===- subzero/crosstest/test_sync_atomic.h - Test prototypes ---*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for cross testing atomic
+// intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "test_sync_atomic.def"
+
+#define X(inst, type)                                                   \
+  type test_##inst(bool fetch_first, volatile type *ptr, type a);       \
+  type test_alloca_##inst(bool fetch, volatile type *ptr, type a);      \
+  type test_const_##inst(bool fetch, volatile type *ptr, type ignored);
+
+FOR_ALL_RMWOP_TYPES(X)
+#undef X
+
+#define X(type)   \
+  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval);
+
+ATOMIC_TYPE_TABLE
+#undef X
diff --git a/crosstest/test_sync_atomic_main.cpp b/crosstest/test_sync_atomic_main.cpp
new file mode 100644
index 0000000..0cae7cd
--- /dev/null
+++ b/crosstest/test_sync_atomic_main.cpp
@@ -0,0 +1,298 @@
+//===- subzero/crosstest/test_sync_atomic_main.cpp - Driver for tests -----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing atomic intrinsics, via the sync builtins.
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_sync_atomic.cpp --crosstest-bitcode=0 \
+   --driver=test_sync_atomic_main.cpp --prefix=Subzero_ \
+   --output=test_sync_atomic */
+
+#include <pthread.h>
+#include <stdint.h>
+
+#include <cerrno>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+// Include test_sync_atomic.h twice - once normally, and once within the
+// Subzero_ namespace, corresponding to the llc and Subzero translated
+// object files, respectively.
+#include "test_sync_atomic.h"
+namespace Subzero_ {
+#include "test_sync_atomic.h"
+}
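+// For example, ::test_add is resolved from the non-Subzero object while
+// Subzero_::test_add is resolved from the Subzero-translated object, so the
+// driver can invoke both and compare results.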
+
+volatile uint64_t Values[] = {
+    0,                    1,                    0x7e,
+    0x7f,                 0x80,                 0x81,
+    0xfe,                 0xff,                 0x7ffe,
+    0x7fff,               0x8000,               0x8001,
+    0xfffe,               0xffff,
+    0x007fffff /*Max subnormal + */,
+    0x00800000 /*Min+ */, 0x7f7fffff /*Max+ */,
+    0x7f800000 /*+Inf*/,  0xff800000 /*-Inf*/,
+    0x7fa00000 /*SNaN*/,  0x7fc00000 /*QNaN*/,
+    0x7ffffffe,           0x7fffffff,           0x80000000,
+    0x80000001,           0xfffffffe,           0xffffffff,
+    0x100000000ll,        0x100000001ll,
+    0x000fffffffffffffll /*Max subnormal + */,
+    0x0010000000000000ll /*Min+ */,
+    0x7fefffffffffffffll /*Max+ */,
+    0x7ff0000000000000ll /*+Inf*/,
+    0xfff0000000000000ll /*-Inf*/,
+    0x7ff0000000000001ll /*SNaN*/,
+    0x7ff8000000000000ll /*QNaN*/,
+    0x7ffffffffffffffell, 0x7fffffffffffffffll, 0x8000000000000000ll,
+    0x8000000000000001ll, 0xfffffffffffffffell, 0xffffffffffffffffll };
+
+const static size_t NumValues = sizeof(Values) / sizeof(*Values);
+
+struct {
+  volatile uint8_t l8;
+  volatile uint16_t l16;
+  volatile uint32_t l32;
+  volatile uint64_t l64;
+} AtomicLocs;
+
+template <typename Type>
+void testAtomicRMW(volatile Type *AtomicLoc,
+                   size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef Type (*FuncType)(bool, volatile Type*, Type);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {
+#define X(inst)                                                             \
+  {                                                                         \
+    STR(inst), test_##inst, Subzero_::test_##inst                           \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst   \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_const", test_const_##inst, Subzero_::test_const_##inst      \
+  },
+      RMWOP_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    for (size_t i = 0; i < NumValues; ++i) {
+      Type Value1 = static_cast<Type>(Values[i]);
+      for (size_t j = 0; j < NumValues; ++j) {
+        Type Value2 = static_cast<Type>(Values[j]);
+        for (size_t k = 0; k < 2; ++k) {
+          bool fetch_first = k;
+          ++TotalTests;
+          *AtomicLoc = Value1;
+          Type ResultSz1 = Funcs[f].FuncSz(
+              fetch_first, AtomicLoc, Value2);
+          Type ResultSz2 = *AtomicLoc;
+          *AtomicLoc = Value1;
+          Type ResultLlc1 = Funcs[f].FuncLlc(
+              fetch_first, AtomicLoc, Value2);
+          Type ResultLlc2 = *AtomicLoc;
+          if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) {
+            ++Passes;
+          } else {
+            ++Failures;
+            std::cout << "test_" << Funcs[f].Name
+                      << (CHAR_BIT * sizeof(Type)) << "("
+                      << static_cast<uint64_t>(Value1) << ", "
+                      << static_cast<uint64_t>(Value2)
+                      << "): sz1=" << static_cast<uint64_t>(ResultSz1)
+                      << " llc1=" << static_cast<uint64_t>(ResultLlc1)
+                      << " sz2=" << static_cast<uint64_t>(ResultSz2)
+                      << " llc2=" << static_cast<uint64_t>(ResultLlc2)
+                      << "\n";
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename Type>
+void testValCompareAndSwap(volatile Type *AtomicLoc, size_t &TotalTests,
+                           size_t &Passes, size_t &Failures) {
+  for (size_t i = 0; i < NumValues; ++i) {
+    Type Value1 = static_cast<Type>(Values[i]);
+    for (size_t j = 0; j < NumValues; ++j) {
+      Type Value2 = static_cast<Type>(Values[j]);
+      for (size_t f = 0; f < 2; ++f) {
+        bool flip = f;
+        ++TotalTests;
+        *AtomicLoc = Value1;
+        Type ResultSz1 = Subzero_::test_val_cmp_swap(
+            AtomicLoc, flip ? Value2 : Value1, Value2);
+        Type ResultSz2 = *AtomicLoc;
+        *AtomicLoc = Value1;
+        Type ResultLlc1 = test_val_cmp_swap(
+            AtomicLoc, flip ? Value2 : Value1, Value2);
+        Type ResultLlc2 = *AtomicLoc;
+        if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) {
+          ++Passes;
+        } else {
+          ++Failures;
+          std::cout << "test_val_cmp_swap" << (CHAR_BIT * sizeof(Type)) << "("
+                    << static_cast<uint64_t>(Value1) << ", "
+                    << static_cast<uint64_t>(Value2)
+                    << "): sz1=" << static_cast<uint64_t>(ResultSz1)
+                    << " llc1=" << static_cast<uint64_t>(ResultLlc1)
+                    << " sz2=" << static_cast<uint64_t>(ResultSz2)
+                    << " llc2=" << static_cast<uint64_t>(ResultLlc2)
+                    << "\n";
+        }
+      }
+    }
+  }
+}
+
+template <typename Type>
+struct ThreadData {
+  Type (*FuncPtr)(bool, volatile Type*, Type);
+  bool Fetch;
+  volatile Type *Ptr;
+  Type Adjustment;
+};
+
+template <typename Type>
+void *threadWrapper(void *Data) {
+  const size_t NumReps = 8000;
+  ThreadData<Type> *TData = reinterpret_cast<ThreadData<Type>*>(Data);
+  for (size_t i = 0; i < NumReps; ++i) {
+    (void)TData->FuncPtr(TData->Fetch, TData->Ptr, TData->Adjustment);
+  }
+  return NULL;
+}
+
+template <typename Type>
+void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
+                          size_t &Passes, size_t &Failures) {
+  typedef Type (*FuncType)(bool, volatile Type*, Type);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {
+#define X(inst)                                                             \
+  {                                                                         \
+    STR(inst), test_##inst, Subzero_::test_##inst                           \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst   \
+  },
+      RMWOP_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+  // Just test a few values, otherwise it takes a *really* long time.
+  volatile uint64_t ValuesSubset[] = { 1, 0x7e, 0x000fffffffffffffffll };
+  const size_t NumValuesSubset = sizeof(ValuesSubset) / sizeof(*ValuesSubset);
+
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    for (size_t i = 0; i < NumValuesSubset; ++i) {
+      Type Value1 = static_cast<Type>(ValuesSubset[i]);
+      for (size_t j = 0; j < NumValuesSubset; ++j) {
+        Type Value2 = static_cast<Type>(ValuesSubset[j]);
+        bool fetch_first = true;
+        ThreadData<Type> TDataSz = {
+          Funcs[f].FuncSz, fetch_first, AtomicLoc, Value2 };
+        ThreadData<Type> TDataLlc = {
+          Funcs[f].FuncLlc, fetch_first, AtomicLoc, Value2 };
+        ++TotalTests;
+        const size_t NumThreads = 4;
+        pthread_t t[NumThreads];
+
+        // Try N threads w/ just Llc.
+        *AtomicLoc = Value1;
+        for (size_t m = 0; m < NumThreads; ++m) {
+          pthread_create(&t[m], NULL, &threadWrapper<Type>,
+                         reinterpret_cast<void *>(&TDataLlc));
+        }
+        for (size_t m = 0; m < NumThreads; ++m) {
+          pthread_join(t[m], NULL);
+        }
+        Type ResultLlc = *AtomicLoc;
+
+        // Try N threads w/ both Sz and Llc.
+        *AtomicLoc = Value1;
+        for (size_t m = 0; m < NumThreads; ++m) {
+          if (pthread_create(&t[m], NULL, &threadWrapper<Type>,
+                             m % 2 == 0
+                             ? reinterpret_cast<void *>(&TDataLlc)
+                             : reinterpret_cast<void *>(&TDataSz)) != 0) {
+            ++Failures;
+            std::cout << "pthread_create failed w/ " << strerror(errno) << "\n";
+            abort();
+          }
+        }
+        for (size_t m = 0; m < NumThreads; ++m) {
+          if (pthread_join(t[m], NULL) != 0) {
+            ++Failures;
+            std::cout << "pthread_join failed w/ " << strerror(errno) << "\n";
+            abort();
+          }
+        }
+        Type ResultMixed = *AtomicLoc;
+
+        if (ResultLlc == ResultMixed) {
+          ++Passes;
+        } else {
+          ++Failures;
+          std::cout << "test_with_threads_" << Funcs[f].Name
+                    << (8 * sizeof(Type)) << "("
+                    << static_cast<uint64_t>(Value1) << ", "
+                    << static_cast<uint64_t>(Value2)
+                    << "): llc=" << static_cast<uint64_t>(ResultLlc)
+                    << " mixed=" << static_cast<uint64_t>(ResultMixed)
+                    << "\n";
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  size_t TotalTests = 0;
+  size_t Passes = 0;
+  size_t Failures = 0;
+
+  testAtomicRMW<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
+  testAtomicRMW<uint16_t>(&AtomicLocs.l16, TotalTests, Passes, Failures);
+  testAtomicRMW<uint32_t>(&AtomicLocs.l32, TotalTests, Passes, Failures);
+  testAtomicRMW<uint64_t>(&AtomicLocs.l64, TotalTests, Passes, Failures);
+  testValCompareAndSwap<uint8_t>(
+      &AtomicLocs.l8, TotalTests, Passes, Failures);
+  testValCompareAndSwap<uint16_t>(
+      &AtomicLocs.l16, TotalTests, Passes, Failures);
+  testValCompareAndSwap<uint32_t>(
+      &AtomicLocs.l32, TotalTests, Passes, Failures);
+  testValCompareAndSwap<uint64_t>(
+      &AtomicLocs.l64, TotalTests, Passes, Failures);
+  testAtomicRMWThreads<uint8_t>(
+      &AtomicLocs.l8, TotalTests, Passes, Failures);
+  testAtomicRMWThreads<uint16_t>(
+      &AtomicLocs.l16, TotalTests, Passes, Failures);
+  testAtomicRMWThreads<uint32_t>(
+      &AtomicLocs.l32, TotalTests, Passes, Failures);
+  testAtomicRMWThreads<uint64_t>(
+      &AtomicLocs.l64, TotalTests, Passes, Failures);
+
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+  return Failures;
+}
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index cd5095f..c0e8c8d 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -51,9 +51,8 @@
     llvm::array_lengthof(TypeX8632Attributes);
 
 const char *InstX8632SegmentRegNames[] = {
-#define X(val, name)                                                           \
-  name,
-    SEG_REGX8632_TABLE
+#define X(val, name) name,
+  SEG_REGX8632_TABLE
 #undef X
 };
 const size_t InstX8632SegmentRegNamesSize =
@@ -140,6 +139,33 @@
   addSource(Source);
 }
 
+InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
+                                   Variable *Eax, Variable *Desired,
+                                   bool Locked)
+    : InstX8632Lockable(Func, InstX8632::Cmpxchg, 3,
+                        llvm::dyn_cast<Variable>(DestOrAddr), Locked) {
+  assert(Eax->getRegNum() == TargetX8632::Reg_eax);
+  addSource(DestOrAddr);
+  addSource(Eax);
+  addSource(Desired);
+}
+
+InstX8632Cmpxchg8b::InstX8632Cmpxchg8b(Cfg *Func, OperandX8632 *Addr,
+                                       Variable *Edx, Variable *Eax,
+                                       Variable *Ecx, Variable *Ebx,
+                                       bool Locked)
+    : InstX8632Lockable(Func, InstX8632::Cmpxchg8b, 5, NULL, Locked) {
+  assert(Edx->getRegNum() == TargetX8632::Reg_edx);
+  assert(Eax->getRegNum() == TargetX8632::Reg_eax);
+  assert(Ecx->getRegNum() == TargetX8632::Reg_ecx);
+  assert(Ebx->getRegNum() == TargetX8632::Reg_ebx);
+  addSource(Addr);
+  addSource(Edx);
+  addSource(Eax);
+  addSource(Ecx);
+  addSource(Ebx);
+}
+
 InstX8632Cvt::InstX8632Cvt(Cfg *Func, Variable *Dest, Operand *Source)
     : InstX8632(Func, InstX8632::Cvt, 1, Dest) {
   addSource(Source);
@@ -284,9 +310,14 @@
 
 InstX8632Xadd::InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source,
                              bool Locked)
-    : InstX8632(Func, InstX8632::Xadd, 2, llvm::dyn_cast<Variable>(Dest)),
-      Locked(Locked) {
-  HasSideEffects = Locked;
+    : InstX8632Lockable(Func, InstX8632::Xadd, 2,
+                        llvm::dyn_cast<Variable>(Dest), Locked) {
+  addSource(Dest);
+  addSource(Source);
+}
+
+InstX8632Xchg::InstX8632Xchg(Cfg *Func, Operand *Dest, Variable *Source)
+    : InstX8632(Func, InstX8632::Xchg, 2, llvm::dyn_cast<Variable>(Dest)) {
   addSource(Dest);
   addSource(Source);
 }
@@ -398,6 +429,7 @@
   Str << "\n";
 }
 
+template <> const char *InstX8632Neg::Opcode = "neg";
 template <> const char *InstX8632Add::Opcode = "add";
 template <> const char *InstX8632Addps::Opcode = "addps";
 template <> const char *InstX8632Adc::Opcode = "adc";
@@ -554,6 +586,48 @@
   dumpSources(Func);
 }
 
+void InstX8632Cmpxchg::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  if (Locked) {
+    Str << "\tlock";
+  }
+  Str << "\tcmpxchg\t";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(2)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Cmpxchg::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg." << getSrc(0)->getType() << " ";
+  dumpSources(Func);
+}
+
+void InstX8632Cmpxchg8b::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 5);
+  if (Locked) {
+    Str << "\tlock";
+  }
+  Str << "\tcmpxchg8b\t";
+  getSrc(0)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Cmpxchg8b::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (Locked) {
+    Str << "lock ";
+  }
+  Str << "cmpxchg8b ";
+  dumpSources(Func);
+}
+
 void InstX8632Cvt::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
@@ -955,10 +1029,9 @@
 void InstX8632Xadd::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   if (Locked) {
-    Str << "\tlock xadd ";
-  } else {
-    Str << "\txadd\t";
+    Str << "\tlock";
   }
+  Str << "\txadd\t";
   getSrc(0)->emit(Func);
   Str << ", ";
   getSrc(1)->emit(Func);
@@ -975,6 +1048,22 @@
   dumpSources(Func);
 }
 
+void InstX8632Xchg::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\txchg\t";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Xchg::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = getSrc(0)->getType();
+  Str << "xchg." << Ty << " ";
+  dumpSources(Func);
+}
+
 void OperandX8632::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "<OperandX8632>";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index baf072a..25beb6d 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -54,9 +54,8 @@
 public:
   enum SegmentRegisters {
     DefaultSegment = -1,
-#define X(val, name)                                                           \
-    val,
-      SEG_REGX8632_TABLE
+#define X(val, name) val,
+    SEG_REGX8632_TABLE
 #undef X
         SegReg_NUM
   };
@@ -142,6 +141,8 @@
     Br,
     Call,
     Cdq,
+    Cmpxchg,
+    Cmpxchg8b,
     Cvt,
     Div,
     Divps,
@@ -162,6 +163,7 @@
     Mul,
     Mulps,
     Mulss,
+    Neg,
     Or,
     Pop,
     Push,
@@ -183,6 +185,7 @@
     Ucomiss,
     UD2,
     Xadd,
+    Xchg,
     Xor
   };
   static const char *getWidthString(Type Ty);
@@ -328,6 +331,41 @@
   virtual ~InstX8632Call() {}
 };
 
+template <InstX8632::InstKindX8632 K>
+class InstX8632Unaryop : public InstX8632 {
+public:
+  // Create a unary-op instruction like neg.
+  // The source and dest are the same variable.
+  static InstX8632Unaryop *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX8632Unaryop>())
+        InstX8632Unaryop(Func, SrcDest);
+  }
+  virtual void emit(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    getSrc(0)->emit(Func);
+    Str << "\n";
+  }
+  virtual void dump(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstX8632Unaryop(Cfg *Func, Operand *SrcDest)
+      : InstX8632(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
+    addSource(SrcDest);
+  }
+  InstX8632Unaryop(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
+  InstX8632Unaryop &operator=(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Unaryop() {}
+  static const char *Opcode;
+};
+
 // See the definition of emitTwoAddress() for a description of
 // ShiftHack.
 void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
@@ -400,6 +438,7 @@
   static const char *Opcode;
 };
 
+typedef InstX8632Unaryop<InstX8632::Neg> InstX8632Neg;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
 typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
 typedef InstX8632Binop<InstX8632::Adc> InstX8632Adc;
@@ -423,6 +462,28 @@
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
 
+// Base class for a lockable x86-32 instruction (emits a locked prefix).
+class InstX8632Lockable : public InstX8632 {
+public:
+  virtual void emit(const Cfg *Func) const = 0;
+  virtual void dump(const Cfg *Func) const;
+
+protected:
+  bool Locked;
+
+  InstX8632Lockable(Cfg *Func, InstKindX8632 Kind, SizeT Maxsrcs,
+                    Variable *Dest, bool Locked)
+      : InstX8632(Func, Kind, Maxsrcs, Dest), Locked(Locked) {
+    // Assume that such instructions are used for atomics, so optimizations
+    // must be careful around them.
+    HasSideEffects = Locked;
+  }
+
+private:
+  InstX8632Lockable(const InstX8632Lockable &) LLVM_DELETED_FUNCTION;
+  InstX8632Lockable &operator=(const InstX8632Lockable &) LLVM_DELETED_FUNCTION;
+};
+
 // Mul instruction - unsigned multiply.
 class InstX8632Mul : public InstX8632 {
 public:
@@ -502,6 +563,57 @@
   virtual ~InstX8632Cdq() {}
 };
 
+// Cmpxchg instruction - cmpxchg <dest>, <desired> compares <dest> with eax.
+// If they are equal, ZF is set and <desired> is stored in <dest>.
+// Otherwise, ZF is cleared and <dest> is copied into eax (or subregister).
+// <dest> can be a register or memory, while <desired> must be a register.
+// It is the user's responsibility to mark eax with a FakeDef.
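+// Emitted form (illustrative): "lock cmpxchg dword ptr [<mem>], <desired>".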
+class InstX8632Cmpxchg : public InstX8632Lockable {
+public:
+  static InstX8632Cmpxchg *create(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                                  Variable *Desired, bool Locked) {
+    return new (Func->allocate<InstX8632Cmpxchg>())
+        InstX8632Cmpxchg(Func, DestOrAddr, Eax, Desired, Locked);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpxchg); }
+
+private:
+  InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+                   Variable *Desired, bool Locked);
+  InstX8632Cmpxchg(const InstX8632Cmpxchg &) LLVM_DELETED_FUNCTION;
+  InstX8632Cmpxchg &operator=(const InstX8632Cmpxchg &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cmpxchg() {}
+};
+
+// Cmpxchg8b instruction - cmpxchg8b <m64> compares <m64> with edx:eax.
+// If they are equal, ZF is set and ecx:ebx is stored in <m64>.
+// Otherwise, ZF is cleared and <m64> is copied into edx:eax.
+// The caller is responsible for inserting FakeDefs to mark edx
+// and eax as modified.
+// <m64> must be a memory operand.
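+// Emitted form (illustrative): "lock cmpxchg8b qword ptr [<m64>]".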
+class InstX8632Cmpxchg8b : public InstX8632Lockable {
+public:
+  static InstX8632Cmpxchg8b *create(Cfg *Func, OperandX8632 *Dest,
+                                    Variable *Edx, Variable *Eax, Variable *Ecx,
+                                    Variable *Ebx, bool Locked) {
+    return new (Func->allocate<InstX8632Cmpxchg8b>())
+        InstX8632Cmpxchg8b(Func, Dest, Edx, Eax, Ecx, Ebx, Locked);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpxchg8b); }
+
+private:
+  InstX8632Cmpxchg8b(Cfg *Func, OperandX8632 *Dest, Variable *Edx,
+                     Variable *Eax, Variable *Ecx, Variable *Ebx, bool Locked);
+  InstX8632Cmpxchg8b(const InstX8632Cmpxchg8b &) LLVM_DELETED_FUNCTION;
+  InstX8632Cmpxchg8b &
+  operator=(const InstX8632Cmpxchg8b &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cmpxchg8b() {}
+};
+
 // Cvt instruction - wrapper for cvtsX2sY where X and Y are in {s,d,i}
 // as appropriate.  s=float, d=double, i=int.  X and Y are determined
 // from dest/src types.  Sign and zero extension on the integer
@@ -861,7 +973,7 @@
 //
 // Both the dest and source are updated. The caller should then insert a
 // FakeDef to reflect the second update.
-class InstX8632Xadd : public InstX8632 {
+class InstX8632Xadd : public InstX8632Lockable {
 public:
   static InstX8632Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
                                bool Locked) {
@@ -873,14 +985,35 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, Xadd); }
 
 private:
-  bool Locked;
-
   InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
   InstX8632Xadd(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
   InstX8632Xadd &operator=(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
   virtual ~InstX8632Xadd() {}
 };
 
+// Exchange instruction.  Exchanges the first operand (destination
+// operand) with the second operand (source operand). At least one of
+// the operands must be a register (and the other can be reg or mem).
+// Both the Dest and Source are updated. If there is a memory operand,
+// then the instruction is automatically "locked" without the need for
+// a lock prefix.
+class InstX8632Xchg : public InstX8632 {
+public:
+  static InstX8632Xchg *create(Cfg *Func, Operand *Dest, Variable *Source) {
+    return new (Func->allocate<InstX8632Xchg>())
+        InstX8632Xchg(Func, Dest, Source);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Xchg); }
+
+private:
+  InstX8632Xchg(Cfg *Func, Operand *Dest, Variable *Source);
+  InstX8632Xchg(const InstX8632Xchg &) LLVM_DELETED_FUNCTION;
+  InstX8632Xchg &operator=(const InstX8632Xchg &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Xchg() {}
+};
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEINSTX8632_H
diff --git a/src/IceIntrinsics.cpp b/src/IceIntrinsics.cpp
index 02562b5..b83513f 100644
--- a/src/IceIntrinsics.cpp
+++ b/src/IceIntrinsics.cpp
@@ -46,7 +46,7 @@
       "nacl.atomic.fence" },
     { { { Intrinsics::AtomicFenceAll, true }, { IceType_void }, 1 },
       "nacl.atomic.fence.all" },
-    { { { Intrinsics::AtomicIsLockFree, true },
+    { { { Intrinsics::AtomicIsLockFree, false },
         { IceType_i1, IceType_i32, IceType_i32 }, 3 },
       "nacl.atomic.is.lock.free" },
 
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index f1b8c25..bf11573 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1968,7 +1968,7 @@
 
 void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
   switch (Instr->getIntrinsicInfo().ID) {
-  case Intrinsics::AtomicCmpxchg:
+  case Intrinsics::AtomicCmpxchg: {
     if (!Intrinsics::VerifyMemoryOrder(
              llvm::cast<ConstantInteger>(Instr->getArg(3))->getValue())) {
       Func->setError("Unexpected memory ordering (success) for AtomicCmpxchg");
@@ -1979,9 +1979,18 @@
       Func->setError("Unexpected memory ordering (failure) for AtomicCmpxchg");
       return;
     }
-    // TODO(jvoung): fill it in.
-    Func->setError("Unhandled intrinsic");
+    Variable *DestPrev = Instr->getDest();
+    Operand *PtrToMem = Instr->getArg(0);
+    Operand *Expected = Instr->getArg(1);
+    Operand *Desired = Instr->getArg(2);
+    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
+    // TODO(jvoung): If we peek ahead a few instructions and see how
+    // DestPrev is used (typically via another compare and branch),
+    // we may be able to optimize. If the result truly is used by a
+    // compare + branch, and the comparison is for equality, then we can
+    // optimize out the later compare, and fuse with the later branch.
     return;
+  }
   case Intrinsics::AtomicFence:
     if (!Intrinsics::VerifyMemoryOrder(
              llvm::cast<ConstantInteger>(Instr->getArg(0))->getValue())) {
@@ -2183,18 +2192,54 @@
   return;
 }
 
+void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
+                                     Operand *Expected, Operand *Desired) {
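+  // Sketch of the lowering (the registers below are pre-colored):
+  //   64-bit: edx:eax <- Expected, ecx:ebx <- Desired,
+  //           lock cmpxchg8b [Ptr], then DestPrev <- edx:eax.
+  //   8/16/32-bit: eax <- Expected, lock cmpxchg [Ptr], <desired reg>,
+  //                then DestPrev <- eax.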
+  if (Expected->getType() == IceType_i64) {
+    // Reserve the pre-colored registers first, before adding any more
+    // infinite-weight variables from FormMemoryOperand's legalization.
+    Variable *T_edx = makeReg(IceType_i32, Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Reg_eax);
+    Variable *T_ecx = makeReg(IceType_i32, Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Reg_ebx);
+    _mov(T_eax, loOperand(Expected));
+    _mov(T_edx, hiOperand(Expected));
+    _mov(T_ebx, loOperand(Desired));
+    _mov(T_ecx, hiOperand(Desired));
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Expected->getType());
+    const bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    Variable *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  Variable *T_eax = makeReg(Expected->getType(), Reg_eax);
+  _mov(T_eax, Expected);
+  OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Expected->getType());
+  Variable *DesiredReg = legalizeToVar(Desired);
+  const bool Locked = true;
+  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
+  _mov(DestPrev, T_eax);
+}
+
 void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
                                  Operand *Ptr, Operand *Val) {
+  bool NeedsCmpxchg = false;
+  LowerBinOp Op_Lo = NULL;
+  LowerBinOp Op_Hi = NULL;
   switch (Operation) {
   default:
     Func->setError("Unknown AtomicRMW operation");
     return;
   case Intrinsics::AtomicAdd: {
     if (Dest->getType() == IceType_i64) {
-      // Do a nasty cmpxchg8b loop. Factor this into a function.
-      // TODO(jvoung): fill it in.
-      Func->setError("Unhandled AtomicRMW operation");
-      return;
+      // All the paths that fall through to the cmpxchg expansion below
+      // must set this to true; the flag exists only for the assert.
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8632::_add;
+      Op_Hi = &TargetX8632::_adc;
+      break;
     }
     OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
     const bool Locked = true;
@@ -2206,26 +2251,160 @@
   }
   case Intrinsics::AtomicSub: {
     if (Dest->getType() == IceType_i64) {
-      // Do a nasty cmpxchg8b loop.
-      // TODO(jvoung): fill it in.
-      Func->setError("Unhandled AtomicRMW operation");
-      return;
+      NeedsCmpxchg = true;
+      Op_Lo = &TargetX8632::_sub;
+      Op_Hi = &TargetX8632::_sbb;
+      break;
     }
-    // Generate a memory operand from Ptr.
-    // neg...
-    // Then do the same as AtomicAdd.
-    // TODO(jvoung): fill it in.
-    Func->setError("Unhandled AtomicRMW operation");
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
+    const bool Locked = true;
+    Variable *T = NULL;
+    _mov(T, Val);
+    _neg(T);
+    _xadd(Addr, T, Locked);
+    _mov(Dest, T);
     return;
   }
   case Intrinsics::AtomicOr:
+    // TODO(jvoung): If Dest is null or dead, then some of these
+    // operations do not need an "exchange", but just a locked op.
+    // That appears to be "worth" it for sub, or, and, and xor.
+    // xadd is probably fine vs lock add for add, and xchg is fine
+    // vs an atomic store.
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_or;
+    Op_Hi = &TargetX8632::_or;
+    break;
   case Intrinsics::AtomicAnd:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_and;
+    Op_Hi = &TargetX8632::_and;
+    break;
   case Intrinsics::AtomicXor:
+    NeedsCmpxchg = true;
+    Op_Lo = &TargetX8632::_xor;
+    Op_Hi = &TargetX8632::_xor;
+    break;
   case Intrinsics::AtomicExchange:
-    // TODO(jvoung): fill it in.
-    Func->setError("Unhandled AtomicRMW operation");
+    if (Dest->getType() == IceType_i64) {
+      NeedsCmpxchg = true;
+      // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
+      // just need to be moved to the ecx and ebx registers.
+      Op_Lo = NULL;
+      Op_Hi = NULL;
+      break;
+    }
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
+    Variable *T = NULL;
+    _mov(T, Val);
+    _xchg(Addr, T);
+    _mov(Dest, T);
     return;
   }
+  // Otherwise, we need a cmpxchg loop.
+  assert(NeedsCmpxchg);
+  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
+}
+
+void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
+                                           Variable *Dest, Operand *Ptr,
+                                           Operand *Val) {
+  // Expand a more complex RMW operation as a cmpxchg loop:
+  // For 64-bit:
+  //   mov     eax, [ptr]
+  //   mov     edx, [ptr + 4]
+  // .LABEL:
+  //   mov     ebx, eax
+  //   <Op_Lo> ebx, <desired_adj_lo>
+  //   mov     ecx, edx
+  //   <Op_Hi> ecx, <desired_adj_hi>
+  //   lock cmpxchg8b [ptr]
+  //   jne     .LABEL
+  //   mov     <dest_lo>, eax
+//   mov     <dest_hi>, edx
+  //
+  // For 32-bit:
+  //   mov     eax, [ptr]
+  // .LABEL:
+  //   mov     <reg>, eax
+  //   op      <reg>, [desired_adj]
+  //   lock cmpxchg [ptr], <reg>
+  //   jne     .LABEL
+  //   mov     <dest>, eax
+  //
+  // If Op_{Lo,Hi} are NULL, then just copy the value.
+  Val = legalize(Val);
+  Type Ty = Val->getType();
+  if (Ty == IceType_i64) {
+    Variable *T_edx = makeReg(IceType_i32, Reg_edx);
+    Variable *T_eax = makeReg(IceType_i32, Reg_eax);
+    OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Ty);
+    _mov(T_eax, loOperand(Addr));
+    _mov(T_edx, hiOperand(Addr));
+    Variable *T_ecx = makeReg(IceType_i32, Reg_ecx);
+    Variable *T_ebx = makeReg(IceType_i32, Reg_ebx);
+    InstX8632Label *Label = InstX8632Label::create(Func, this);
+    const bool IsXchg8b = Op_Lo == NULL && Op_Hi == NULL;
+    if (!IsXchg8b) {
+      Context.insert(Label);
+      _mov(T_ebx, T_eax);
+      (this->*Op_Lo)(T_ebx, loOperand(Val));
+      _mov(T_ecx, T_edx);
+      (this->*Op_Hi)(T_ecx, hiOperand(Val));
+    } else {
+      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
+      // It just needs the Val loaded into ebx and ecx.
+      // That can also be done before the loop.
+      _mov(T_ebx, loOperand(Val));
+      _mov(T_ecx, hiOperand(Val));
+      Context.insert(Label);
+    }
+    const bool Locked = true;
+    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+    _br(InstX8632Br::Br_ne, Label);
+    if (!IsXchg8b) {
+      // If Val is a variable, model the extended live range of Val through
+      // the end of the loop, since it will be re-used by the loop.
+      if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+        Variable *ValLo = llvm::cast<Variable>(loOperand(ValVar));
+        Variable *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
+        Context.insert(InstFakeUse::create(Func, ValLo));
+        Context.insert(InstFakeUse::create(Func, ValHi));
+      }
+    } else {
+      // For xchg, the loop is slightly smaller and ebx/ecx are used.
+      Context.insert(InstFakeUse::create(Func, T_ebx));
+      Context.insert(InstFakeUse::create(Func, T_ecx));
+    }
+    // The address base is also reused in the loop.
+    Context.insert(InstFakeUse::create(Func, Addr->getBase()));
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    _mov(DestLo, T_eax);
+    _mov(DestHi, T_edx);
+    return;
+  }
+  OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Ty);
+  Variable *T_eax = makeReg(Ty, Reg_eax);
+  _mov(T_eax, Addr);
+  InstX8632Label *Label = InstX8632Label::create(Func, this);
+  Context.insert(Label);
+  // We want to pick a different register for T than Eax, so don't use
+  // _mov(T == NULL, T_eax).
+  Variable *T = makeReg(Ty);
+  _mov(T, T_eax);
+  (this->*Op_Lo)(T, Val);
+  const bool Locked = true;
+  _cmpxchg(Addr, T_eax, T, Locked);
+  _br(InstX8632Br::Br_ne, Label);
+  // If Val is a variable, model the extended live range of Val through
+  // the end of the loop, since it will be re-used by the loop.
+  if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+    Context.insert(InstFakeUse::create(Func, ValVar));
+  }
+  // The address base is also reused in the loop.
+  Context.insert(InstFakeUse::create(Func, Addr->getBase()));
+  _mov(Dest, T_eax);
 }
 
 namespace {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 001f4e6..4953ffc 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -95,9 +95,15 @@
   virtual void doAddressOptLoad();
   virtual void doAddressOptStore();
 
+  void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
+                          Operand *Desired);
   void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                       Operand *Val);
 
+  typedef void (TargetX8632::*LowerBinOp)(Variable *, Operand *);
+  void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
+                                Variable *Dest, Operand *Ptr, Operand *Val);
+
   // Operand legalization helpers.  To deal with address mode
   // constraints, the helpers will create a new Operand and emit
   // instructions that guarantee that the Operand kind is one of those
@@ -177,6 +183,22 @@
   void _cmp(Operand *Src0, Operand *Src1) {
     Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
   }
+  void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
+                bool Locked) {
+    Context.insert(
+        InstX8632Cmpxchg::create(Func, DestOrAddr, Eax, Desired, Locked));
+    // Mark eax as possibly modified by cmpxchg.
+    Context.insert(
+        InstFakeDef::create(Func, Eax, llvm::dyn_cast<Variable>(DestOrAddr)));
+  }
+  void _cmpxchg8b(OperandX8632 *Addr, Variable *Edx, Variable *Eax,
+                  Variable *Ecx, Variable *Ebx, bool Locked) {
+    Context.insert(
+        InstX8632Cmpxchg8b::create(Func, Addr, Edx, Eax, Ecx, Ebx, Locked));
+    // Mark edx and eax as possibly modified by cmpxchg8b.
+    Context.insert(InstFakeDef::create(Func, Edx));
+    Context.insert(InstFakeDef::create(Func, Eax));
+  }
   void _cvt(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Cvt::create(Func, Dest, Src0));
   }
@@ -232,6 +254,9 @@
   void _mulss(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Mulss::create(Func, Dest, Src0));
   }
+  void _neg(Variable *SrcDest) {
+    Context.insert(InstX8632Neg::create(Func, SrcDest));
+  }
   void _or(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Or::create(Func, Dest, Src0));
   }
@@ -294,7 +319,14 @@
     Context.insert(InstX8632Xadd::create(Func, Dest, Src, Locked));
     // The xadd exchanges Dest and Src (modifying Src).
     // Model that update with a FakeDef.
-    Context.insert(InstFakeDef::create(Func, Src));
+    Context.insert(
+        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
+  }
+  void _xchg(Operand *Dest, Variable *Src) {
+    Context.insert(InstX8632Xchg::create(Func, Dest, Src));
+    // The xchg modifies Dest and Src -- model that update with a FakeDef.
+    Context.insert(
+        InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
   }
   void _xor(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Xor::create(Func, Dest, Src0));
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
index 8dfcc61..9885b88 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -2,6 +2,7 @@
 ; size allowed.
 
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
@@ -28,6 +29,11 @@
 declare void @llvm.nacl.atomic.fence.all()
 declare i1 @llvm.nacl.atomic.is.lock.free(i32, i8*)
 
+; NOTE: The LLC equivalents of the 16-bit atomic operations are expanded
+; into 32-bit operations. For Subzero, assume that real 16-bit operations
+; will be usable (the validator will be fixed):
+; https://code.google.com/p/nativeclient/issues/detail?id=2981
+
 ;;; Load
 
 ; x86 guarantees load/store to be atomic if naturally aligned.
@@ -107,7 +113,6 @@
 ; CHECK: movq x{{.*}}, qword
 ; CHECK: movq qword {{.*}}, x{{.*}}
 
-
 ;;; Store
 
 define void @test_atomic_store_8(i32 %iptr, i32 %v) {
@@ -169,6 +174,8 @@
 
 ;;; RMW
 
+;; add
+
 define i32 @test_atomic_rmw_add_8(i32 %iptr, i32 %v) {
 entry:
   %trunc = trunc i32 %v to i8
@@ -180,7 +187,7 @@
 }
 ; CHECK-LABEL: test_atomic_rmw_add_8
 ; CHECK: lock xadd byte {{.*}}, [[REG:.*]]
-; CHECK: mov {{.*}}, {{.*}}[[REG]]
+; CHECK: mov {{.*}}, [[REG]]
 
 define i32 @test_atomic_rmw_add_16(i32 %iptr, i32 %v) {
 entry:
@@ -192,7 +199,7 @@
 }
 ; CHECK-LABEL: test_atomic_rmw_add_16
 ; CHECK: lock xadd word {{.*}}, [[REG:.*]]
-; CHECK: mov {{.*}}, {{.*}}[[REG]]
+; CHECK: mov {{.*}}, [[REG]]
 
 define i32 @test_atomic_rmw_add_32(i32 %iptr, i32 %v) {
 entry:
@@ -202,16 +209,61 @@
 }
 ; CHECK-LABEL: test_atomic_rmw_add_32
 ; CHECK: lock xadd dword {{.*}}, [[REG:.*]]
-; CHECK: mov {{.*}}, {{.*}}[[REG]]
+; CHECK: mov {{.*}}, [[REG]]
 
-;define i64 @test_atomic_rmw_add_64(i32 %iptr, i64 %v) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i64*
-;  %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
-;  ret i64 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_add_64
-; CHECKLATER: uh need a... cmpxchg8b loop.
+define i64 @test_atomic_rmw_add_64(i32 %iptr, i64 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
+  ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_add_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; The RHS of the add cannot be any of the e[abcd]x regs because they are
+; clobbered in the loop, and the RHS needs to remain live.
+; CHECK: add ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: adc ecx, {{.*e.[^x]}}
+; Ptr cannot be eax, ebx, ecx, or edx (used up for the expected and desired).
+; It can be esi, edi, or ebp though, for example (so we need to be careful
+; about rejecting eb* and ed*.)
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+; Test with some more register pressure. When we have an alloca, ebp is
+; used to manage the stack frame, so it cannot be used as a register either.
+declare void @use_ptr(i32 %iptr)
+
+define i64 @test_atomic_rmw_add_64_alloca(i32 %iptr, i64 %v) {
+entry:
+  %alloca_ptr = alloca i8, i32 16, align 16
+  %ptr = inttoptr i32 %iptr to i64*
+  %old = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
+  store i8 0, i8* %alloca_ptr, align 1
+  store i8 1, i8* %alloca_ptr, align 1
+  store i8 2, i8* %alloca_ptr, align 1
+  store i8 3, i8* %alloca_ptr, align 1
+  %__5 = ptrtoint i8* %alloca_ptr to i32
+  call void @use_ptr(i32 %__5)
+  ret i64 %old
+}
+; CHECK-LABEL: test_atomic_rmw_add_64_alloca
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; Ptr cannot be eax, ebx, ecx, or edx (used up for the expected and desired).
+; It also cannot be ebp since we use that for alloca. Also make sure it's
+; not esp, since that's the stack pointer and mucking with it will break
+; the later use_ptr function call.
+; That pretty much leaves esi or edi as the only viable registers.
+; CHECK: lock cmpxchg8b qword ptr [e{{[ds]}}i]
+; CHECK: call use_ptr
 
 define i32 @test_atomic_rmw_add_32_ignored(i32 %iptr, i32 %v) {
 entry:
@@ -219,129 +271,562 @@
   %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %v, i32 6)
   ret i32 %v
 }
+; Technically this could use "lock add" instead of "lock xadd", if liveness
+; tells us that the destination variable is dead.
 ; CHECK-LABEL: test_atomic_rmw_add_32_ignored
 ; CHECK: lock xadd dword {{.*}}, [[REG:.*]]
 
-;define i32 @test_atomic_rmw_sub_32(i32 %iptr, i32 %v) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i32*
-;  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %v, i32 6)
-;  ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_sub_32
-; CHECKLATER: neg
-; CHECKLATER: lock
-; CHECKLATER: xadd
+; Atomic RMW 64 needs to be expanded into its own loop.
+; Make sure that works w/ non-trivial function bodies.
+define i64 @test_atomic_rmw_add_64_loop(i32 %iptr, i64 %v) {
+entry:
+  %x = icmp ult i64 %v, 100
+  br i1 %x, label %err, label %loop
 
-;define i32 @test_atomic_rmw_or_32(i32 %iptr, i32 %v) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i32*
-;  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %v, i32 6)
-;  ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_or_32
-; Need a cmpxchg loop.
+loop:
+  %v_next = phi i64 [ %v, %entry ], [ %next, %loop ]
+  %ptr = inttoptr i32 %iptr to i64*
+  %next = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v_next, i32 6)
+  %success = icmp eq i64 %next, 100
+  br i1 %success, label %done, label %loop
 
-;define i32 @test_atomic_rmw_and_32(i32 %iptr, i32 %v) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i32*
-;  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %v, i32 6)
-;  ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_and_32
-; Also a cmpxchg loop.
+done:
+  ret i64 %next
 
-;define i32 @test_atomic_rmw_xor_32(i32 %iptr, i32 %v) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i32*
-;  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %v, i32 6)
-;  ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_xor_32
-; Also a cmpxchg loop.
+err:
+  ret i64 0
+}
+; CHECK-LABEL: test_atomic_rmw_add_64_loop
+; CHECK: push ebx
+; CHECK-LABEL: .Ltest_atomic_rmw_add_64_loop{{.*}}loop
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: add ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: adc ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+; CHECK-LABEL: .Ltest_atomic_rmw_add_64_loop{{.*}}done
 
-;define i32 @test_atomic_rmw_xchg_32(i32 %iptr, i32 %v) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i32*
-;  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %v, i32 6)
-;  ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_xchg_32
+;; sub
+
+define i32 @test_atomic_rmw_sub_8(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i8
+  %ptr = inttoptr i32 %iptr to i8*
+  %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 2, i8* %ptr, i8 %trunc, i32 6)
+  %a_ext = zext i8 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_sub_8
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd byte {{.*}}, [[REG]]
+; CHECK: mov {{.*}}, [[REG]]
+
+define i32 @test_atomic_rmw_sub_16(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i16
+  %ptr = inttoptr i32 %iptr to i16*
+  %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 2, i16* %ptr, i16 %trunc, i32 6)
+  %a_ext = zext i16 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_sub_16
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd word {{.*}}, [[REG]]
+; CHECK: mov {{.*}}, [[REG]]
+
+define i32 @test_atomic_rmw_sub_32(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %v, i32 6)
+  ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_sub_32
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd dword {{.*}}, [[REG]]
+; CHECK: mov {{.*}}, [[REG]]
+
+define i64 @test_atomic_rmw_sub_64(i32 %iptr, i64 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 2, i64* %ptr, i64 %v, i32 6)
+  ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_sub_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: sub ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: sbb ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+
+define i32 @test_atomic_rmw_sub_32_ignored(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %v, i32 6)
+  ret i32 %v
+}
+; Could use "lock sub" instead of "neg; lock xadd"
+; CHECK-LABEL: test_atomic_rmw_sub_32_ignored
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd dword {{.*}}, [[REG]]
+
+;; or
+
+define i32 @test_atomic_rmw_or_8(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i8
+  %ptr = inttoptr i32 %iptr to i8*
+  %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 3, i8* %ptr, i8 %trunc, i32 6)
+  %a_ext = zext i8 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_or_8
+; CHECK: mov al, byte ptr
+; CHECK: .L[[LABEL:.*]]:
+; The or's destination cannot be eax here, because eax holds the old value.
+; Also make sure that cmpxchg's source operand is that same register.
+; CHECK: or [[REG:[^a].]]
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
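+;
+; A minimal sketch of the cmpxchg loop shape these or/and/xor tests expect
+; (register names other than al/eax are illustrative):
+;   mov al, byte ptr [ptr]          ; al = old value (cmpxchg compares al)
+; retry:
+;   mov cl, al
+;   or cl, v                        ; new value computed in a non-eax register
+;   lock cmpxchg byte ptr [ptr], cl ; store cl if [ptr] still equals al,
+;   jne retry                       ; otherwise al = [ptr] and we retry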
+
+define i32 @test_atomic_rmw_or_16(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i16
+  %ptr = inttoptr i32 %iptr to i16*
+  %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 3, i16* %ptr, i16 %trunc, i32 6)
+  %a_ext = zext i16 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_or_16
+; CHECK: mov ax, word ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: or [[REG:[^a].]]
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_or_32(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %v, i32 6)
+  ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_or_32
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: or [[REG:e[^a].]]
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i64 @test_atomic_rmw_or_64(i32 %iptr, i64 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 3, i64* %ptr, i64 %v, i32 6)
+  ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_or_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: or ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: or ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_or_32_ignored(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %v, i32 6)
+  ret i32 %v
+}
+; CHECK-LABEL: test_atomic_rmw_or_32_ignored
+; Could just "lock or", if we inspect the liveness information first.
+; Would also need a way to introduce "lock"'edness to binary
+; operators without introducing overhead on the more common binary ops.
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: or [[REG:e[^a].]]
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
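+;
+; A sketch of the hypothetical optimized form mentioned above (not what
+; Subzero emits today), assuming liveness shows the result is dead:
+;   lock or dword ptr [ptr], v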
+
+;; and
+
+define i32 @test_atomic_rmw_and_8(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i8
+  %ptr = inttoptr i32 %iptr to i8*
+  %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 4, i8* %ptr, i8 %trunc, i32 6)
+  %a_ext = zext i8 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_and_8
+; CHECK: mov al, byte ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and [[REG:[^a].]]
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_and_16(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i16
+  %ptr = inttoptr i32 %iptr to i16*
+  %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 4, i16* %ptr, i16 %trunc, i32 6)
+  %a_ext = zext i16 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_and_16
+; CHECK: mov ax, word ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_and_32(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %v, i32 6)
+  ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_and_32
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+define i64 @test_atomic_rmw_and_64(i32 %iptr, i64 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 4, i64* %ptr, i64 %v, i32 6)
+  ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_and_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: and ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: and ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_and_32_ignored(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %v, i32 6)
+  ret i32 %v
+}
+; CHECK-LABEL: test_atomic_rmw_and_32_ignored
+; Could just "lock and"
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+;; xor
+
+define i32 @test_atomic_rmw_xor_8(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i8
+  %ptr = inttoptr i32 %iptr to i8*
+  %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 5, i8* %ptr, i8 %trunc, i32 6)
+  %a_ext = zext i8 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xor_8
+; CHECK: mov al, byte ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor [[REG:[^a].]]
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_xor_16(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i16
+  %ptr = inttoptr i32 %iptr to i16*
+  %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 5, i16* %ptr, i16 %trunc, i32 6)
+  %a_ext = zext i16 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xor_16
+; CHECK: mov ax, word ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_xor_32(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %v, i32 6)
+  ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xor_32
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+define i64 @test_atomic_rmw_xor_64(i32 %iptr, i64 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 5, i64* %ptr, i64 %v, i32 6)
+  ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xor_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: xor ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: xor ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_xor_32_ignored(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %v, i32 6)
+  ret i32 %v
+}
+; CHECK-LABEL: test_atomic_rmw_xor_32_ignored
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+;; exchange
+
+define i32 @test_atomic_rmw_xchg_8(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i8
+  %ptr = inttoptr i32 %iptr to i8*
+  %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 6, i8* %ptr, i8 %trunc, i32 6)
+  %a_ext = zext i8 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_8
+; CHECK: xchg byte ptr {{.*}}, [[REG:.*]]
+
+define i32 @test_atomic_rmw_xchg_16(i32 %iptr, i32 %v) {
+entry:
+  %trunc = trunc i32 %v to i16
+  %ptr = inttoptr i32 %iptr to i16*
+  %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 6, i16* %ptr, i16 %trunc, i32 6)
+  %a_ext = zext i16 %a to i32
+  ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_16
+; CHECK: xchg word ptr {{.*}}, [[REG:.*]]
+
+define i32 @test_atomic_rmw_xchg_32(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %v, i32 6)
+  ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_32
+; CHECK: xchg dword ptr {{.*}}, [[REG:.*]]
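+;
+; Note: xchg with a memory operand is implicitly locked, so the whole
+; operation is a single instruction with no explicit "lock" prefix, e.g.:
+;   xchg dword ptr [ptr], ecx       ; ecx gets the old value, [ptr] gets %v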
+
+define i64 @test_atomic_rmw_xchg_64(i32 %iptr, i64 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 6, i64* %ptr, i64 %v, i32 6)
+  ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_64
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: lock cmpxchg8b qword ptr [{{e.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_xchg_32_ignored(i32 %iptr, i32 %v) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %v, i32 6)
+  ret i32 %v
+}
+; In this case, ignoring the return value doesn't help. The xchg is
+; used to do an atomic store.
+; CHECK-LABEL: test_atomic_rmw_xchg_32_ignored
+; CHECK: xchg dword ptr {{.*}}, [[REG:.*]]
 
 ;;;; Cmpxchg
 
-;define i32 @test_atomic_cmpxchg_8(i32 %iptr, i32 %expected, i32 %desired) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i8*
-;  %trunc_exp = trunc i32 %expected to i8
-;  %trunc_des = trunc i32 %desired to i8
-;  %old = call i8 @llvm.nacl.atomic.cmpxchg.i8(i8* %ptr, i8 %trunc_exp,
-;                                              i8 %trunc_des, i32 6, i32 6)
-;  %old_ext = zext i8 %old to i32
-;  ret i32 %old_ext
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_8
-; CHECKLATER: lock cmpxchg byte
+define i32 @test_atomic_cmpxchg_8(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  %trunc_exp = trunc i32 %expected to i8
+  %trunc_des = trunc i32 %desired to i8
+  %ptr = inttoptr i32 %iptr to i8*
+  %old = call i8 @llvm.nacl.atomic.cmpxchg.i8(i8* %ptr, i8 %trunc_exp,
+                                              i8 %trunc_des, i32 6, i32 6)
+  %old_ext = zext i8 %old to i32
+  ret i32 %old_ext
+}
+; CHECK-LABEL: test_atomic_cmpxchg_8
+; CHECK: mov al, {{.*}}
+; Need to check that eax isn't used as the address register or for the
+; desired value, since it is already used for the *expected* value.
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], {{[^a]}}
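+;
+; Reminder of the cmpxchg contract these checks rely on: al/ax/eax holds the
+; expected value on entry, the desired value comes from another register, and
+; on exit eax holds the value actually found in memory (ZF set on success):
+;   mov al, expected
+;   lock cmpxchg byte ptr [ptr], desired   ; "ptr"/"desired" are placeholders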
 
-;define i32 @test_atomic_cmpxchg_16(i32 %iptr, i32 %expected, i32 %desired) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i16*
-;  %trunc_exp = trunc i32 %expected to i16
-;  %trunc_des = trunc i32 %desired to i16
-;  %old = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %trunc_exp,
-;                                               i16 %trunc_des, i32 6, i32 6)
-;  %old_ext = zext i16 %old to i32
-;  ret i32 %old_ext
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_16
-; This one is a bit gross for NaCl right now.
-; https://code.google.com/p/nativeclient/issues/detail?id=2981
-; But we'll assume that NaCl will have it fixed...
-; CHECKLATER: lock cmpxchg word
+define i32 @test_atomic_cmpxchg_16(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  %trunc_exp = trunc i32 %expected to i16
+  %trunc_des = trunc i32 %desired to i16
+  %ptr = inttoptr i32 %iptr to i16*
+  %old = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %trunc_exp,
+                                               i16 %trunc_des, i32 6, i32 6)
+  %old_ext = zext i16 %old to i32
+  ret i32 %old_ext
+}
+; CHECK-LABEL: test_atomic_cmpxchg_16
+; CHECK: mov ax, {{.*}}
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}], {{[^a]}}
 
-;define i32 @test_atomic_cmpxchg_32(i32 %iptr, i32 %expected, i32 %desired) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i32*
-;  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
-;                                               i32 %desired, i32 6, i32 6)
-;  ret i32 %old
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_32
-; CHECKLATER: mov eax
-; CHECKLATER: mov ecx
-; CHECKLATER: lock cmpxchg dword
+define i32 @test_atomic_cmpxchg_32(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
+                                               i32 %desired, i32 6, i32 6)
+  ret i32 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_32
+; CHECK: mov eax, {{.*}}
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
 
-;define i64 @test_atomic_cmpxchg_64(i32 %iptr, i64 %expected, i64 %desired) {
-;entry:
-;  %ptr = inttoptr i32 %iptr to i64*
-;  %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
-;                                               i64 %desired, i32 6, i32 6)
-;  ret i64 %old
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_64
-; CHECKLATER: mov eax
-; CHECKLATER: mov edx
-; CHECKLATER: mov ebx
-; CHECKLATER: mov ecx
-; CHECKLATER: lock cmpxchg8b qword
+define i64 @test_atomic_cmpxchg_64(i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+                                               i64 %desired, i32 6, i32 6)
+  ret i64 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; edx:eax is already the return register pair, so the old value does not
+; need to be reshuffled via movs. The next test stores the result to memory,
+; so in that case it does need to be copied out of edx:eax.
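+;
+; cmpxchg8b pins all four value registers: edx:eax is the expected value,
+; ecx:ebx the desired value, and the old value comes back in edx:eax. A
+; minimal sketch (the pointer register is illustrative):
+;   mov edx, expected_hi
+;   mov eax, expected_lo
+;   mov ecx, desired_hi
+;   mov ebx, desired_lo
+;   lock cmpxchg8b qword ptr [esi]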
 
-;define i32 @test_atomic_cmpxchg_32_loop(i32 %iptr,
-;       i32 %expected, i32 %desired) {
-;entry:
-;  br label %loop
-;
-;loop:
-;  %cmp = phi i32 [ %expected, %entry], [%old, %loop]
-;  %ptr = inttoptr i32 %iptr to i32*
-;  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %cmp,
-;                                               i32 %desired, i32 6, i32 6)
-;  %success = icmp eq i32 %cmp, %old
-;  br i1 %success, label %done, label %loop
-;
-;done:
-;  ret i32 %old
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_32_loop
+; Test a case where %old really does need to be copied out of edx:eax.
+define void @test_atomic_cmpxchg_64_store(i32 %ret_iptr, i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+                                                i64 %desired, i32 6, i32 6)
+  %__6 = inttoptr i32 %ret_iptr to i64*
+  store i64 %old, i64* %__6, align 1
+  ret void
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64_store
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: mov {{.*}}, edx
+; CHECK: mov {{.*}}, eax
+
+; Test with some more register pressure. When we have an alloca, ebp is
+; used to manage the stack frame, so it is not available as a general
+; register either.
+define i64 @test_atomic_cmpxchg_64_alloca(i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+  %alloca_ptr = alloca i8, i32 16, align 16
+  %ptr = inttoptr i32 %iptr to i64*
+  %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+                                                i64 %desired, i32 6, i32 6)
+  store i8 0, i8* %alloca_ptr, align 1
+  store i8 1, i8* %alloca_ptr, align 1
+  store i8 2, i8* %alloca_ptr, align 1
+  store i8 3, i8* %alloca_ptr, align 1
+  %__6 = ptrtoint i8* %alloca_ptr to i32
+  call void @use_ptr(i32 %__6)
+  ret i64 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64_alloca
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; The pointer cannot be in eax, ebx, ecx, or edx (those are used for the
+; expected and desired values). It also cannot be ebp, since that manages the
+; alloca's stack frame, and it must not be esp, since clobbering the stack
+; pointer would break the later use_ptr call.
+; That leaves esi or edi as the only viable registers.
+; CHECK: lock cmpxchg8b qword ptr [e{{[ds]}}i]
+; CHECK: call use_ptr
+
+define i32 @test_atomic_cmpxchg_32_ignored(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  %ptr = inttoptr i32 %iptr to i32*
+  %ignored = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
+                                                    i32 %desired, i32 6, i32 6)
+  ret i32 0
+}
+; CHECK-LABEL: test_atomic_cmpxchg_32_ignored
+; CHECK: mov eax, {{.*}}
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+
+define i64 @test_atomic_cmpxchg_64_ignored(i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+  %ptr = inttoptr i32 %iptr to i64*
+  %ignored = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+                                                    i64 %desired, i32 6, i32 6)
+  ret i64 0
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64_ignored
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+
+define i32 @test_atomic_cmpxchg_32_loop(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  br label %loop
+
+loop:
+  %cmp = phi i32 [ %expected, %entry ], [ %old, %loop ]
+  %ptr = inttoptr i32 %iptr to i32*
+  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %cmp,
+                                                i32 %desired, i32 6, i32 6)
+  %success = icmp eq i32 %cmp, %old
+  br i1 %success, label %done, label %loop
+
+done:
+  ret i32 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_32_loop
 
 ;;;; Fence and is-lock-free.
 
@@ -381,6 +866,19 @@
 ; CHECK-LABEL: test_not_lock_free
 ; CHECK: mov {{.*}}, 0
 
+define i32 @test_atomic_is_lock_free_ignored(i32 %iptr) {
+entry:
+  %ptr = inttoptr i32 %iptr to i8*
+  %ignored = call i1 @llvm.nacl.atomic.is.lock.free(i32 4, i8* %ptr)
+  ret i32 0
+}
+; CHECK-LABEL: test_atomic_is_lock_free_ignored
+; CHECK: mov {{.*}}, 0
+; This can get optimized out, because it's side-effect-free.
+; CHECKO2REM-LABEL: test_atomic_is_lock_free_ignored
+; CHECKO2REM-NOT: mov {{.*}}, 1
+; CHECKO2REM: mov {{.*}}, 0
+
 ; TODO(jvoung): at some point we can take advantage of the
 ; fact that nacl.atomic.is.lock.free will resolve to a constant
 ; (which adds DCE opportunities). Once we optimize, the test expectations