Add atomic load/store, fetch_add, fence, and is-lock-free lowering.
Atomic loads/stores of type i8, i16, and i32 are converted to plain
load/store instructions and lowered with the usual
lowerLoad/lowerStore. Atomic stores are followed by an mfence for
sequential consistency.
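For illustration only (not code in this CL), this is the kind of C++11
source the 32-bit path covers; the lowering turns the store into a
plain mov followed by an mfence:

  #include <atomic>
  #include <cstdint>

  void store_u32(std::atomic<uint32_t> *Ptr, uint32_t Val) {
    // seq_cst is the only ordering PNaCl currently allows; the store
    // itself becomes a normal 32-bit mov, and the trailing mfence
    // provides the sequential-consistency guarantee.
    Ptr->store(Val, std::memory_order_seq_cst);
  }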
For 64-bit types, use movq so that the 64-bit memory load/store happens
in a single instruction (instead of the usual lowering, which breaks it
into separate 32-bit loads/stores). For a store, this means bitcasting
the i64 to f64 first (which splits the load of the value to be stored
into two 32-bit ops), then storing it with a single movq. For a load,
load into an f64 and then bitcast back to i64 (so the split happens
after the atomic load). This follows what GCC does for the C++11
std::atomic<uint64_t> load/store methods (it uses movq when
-mfpmath=sse). This introduces some redundancy between movq and movsd,
but the convention seems to be to use movq when working with integer
quantities; otherwise movsd would work too. The difference seems to be
whether or not the XMM register's upper 64 bits are filled with zero,
and zero-extending could help avoid partial register stalls.
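For reference, a sketch (not code in this CL) of the
std::atomic<uint64_t> accessors whose GCC codegen this mirrors:

  #include <atomic>
  #include <cstdint>

  // With -mfpmath=sse, GCC performs each of these as a single movq
  // through an XMM register on x86-32; that is the pattern the
  // lowering follows here.
  uint64_t load_u64(const std::atomic<uint64_t> *Ptr) {
    return Ptr->load(std::memory_order_seq_cst);
  }

  void store_u64(std::atomic<uint64_t> *Ptr, uint64_t Val) {
    Ptr->store(Val, std::memory_order_seq_cst);
  }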
Fetch_add is handled for types up to i32. TODO: add i64 via a cmpxchg
loop (sketched below).
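At the C++ level, the i64 cmpxchg loop would amount to roughly the
following (a hedged sketch; fetch_add_u64 is a hypothetical helper, not
part of this CL):

  #include <cstdint>

  // Emulate a 64-bit fetch_add with a compare-and-swap loop, which is
  // what a lock cmpxchg8b loop implements on x86-32.
  uint64_t fetch_add_u64(volatile uint64_t *Ptr, uint64_t Val) {
    uint64_t Old = *Ptr;  // A torn read here only costs an extra iteration.
    for (;;) {
      uint64_t Seen = __sync_val_compare_and_swap(Ptr, Old, Old + Val);
      if (Seen == Old)
        return Old;  // CAS succeeded; return the previous value.
      Old = Seen;    // CAS failed; retry with the value actually observed.
    }
  }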
TODO: add some runnable crosstests to make sure that this doesn't do
funny things to integer bit patterns that happen to look like signaling
or quiet NaNs. However, the system clang would not know how to handle
"llvm.nacl.*" if we choose to target that level directly via .ll files.
Alternatively, (a) use the old-school __sync methods
(__sync_fetch_and_add with 0 to load), or (b) require the buildbot's
clang/gcc to support C++11.
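For option (a), a crosstest helper could look roughly like this (hedged
sketch; atomic_load_u64 and the NaN bit pattern are just illustrative):

  #include <cstdint>

  // Old-school __sync style: a fetch_add of 0 doubles as an atomic
  // load, so the crosstest needs neither C++11 <atomic> nor support
  // for the llvm.nacl.* intrinsics.
  uint64_t atomic_load_u64(volatile uint64_t *Ptr) {
    return __sync_fetch_and_add(Ptr, 0);
  }

  // One bit pattern worth covering: viewed as a double this is the
  // canonical quiet NaN, so a lowering that bounces through XMM
  // registers must not alter it.
  static const uint64_t kQuietNaNBits = 0x7ff8000000000000ull;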
BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/342763004
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 6477683..376d454 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -166,6 +166,11 @@
addSource(Src2);
}
+InstX8632Mfence::InstX8632Mfence(Cfg *Func)
+ : InstX8632(Func, InstX8632::Mfence, 0, NULL) {
+ HasSideEffects = true;
+}
+
InstX8632Store::InstX8632Store(Cfg *Func, Operand *Value, OperandX8632 *Mem)
: InstX8632(Func, InstX8632::Store, 2, NULL) {
addSource(Value);
@@ -177,6 +182,17 @@
addSource(Source);
}
+InstX8632StoreQ::InstX8632StoreQ(Cfg *Func, Operand *Value, OperandX8632 *Mem)
+ : InstX8632(Func, InstX8632::StoreQ, 2, NULL) {
+ addSource(Value);
+ addSource(Mem);
+}
+
+InstX8632Movq::InstX8632Movq(Cfg *Func, Variable *Dest, Operand *Source)
+ : InstX8632(Func, InstX8632::Movq, 1, Dest) {
+ addSource(Source);
+}
+
InstX8632Movsx::InstX8632Movsx(Cfg *Func, Variable *Dest, Operand *Source)
: InstX8632(Func, InstX8632::Movsx, 1, Dest) {
addSource(Source);
@@ -221,12 +237,34 @@
return false;
}
+bool InstX8632Movq::isRedundantAssign() const {
+ Variable *Src = llvm::dyn_cast<Variable>(getSrc(0));
+ if (Src == NULL)
+ return false;
+ if (getDest()->hasReg() && getDest()->getRegNum() == Src->getRegNum()) {
+ return true;
+ }
+ if (!getDest()->hasReg() && !Src->hasReg() &&
+ Dest->getStackOffset() == Src->getStackOffset())
+ return true;
+ return false;
+}
+
InstX8632Ret::InstX8632Ret(Cfg *Func, Variable *Source)
: InstX8632(Func, InstX8632::Ret, Source ? 1 : 0, NULL) {
if (Source)
addSource(Source);
}
+InstX8632Xadd::InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source,
+ bool Locked)
+ : InstX8632(Func, InstX8632::Xadd, 2, llvm::dyn_cast<Variable>(Dest)),
+ Locked(Locked) {
+ HasSideEffects = Locked;
+ addSource(Dest);
+ addSource(Source);
+}
+
// ======================== Dump routines ======================== //
void InstX8632::dump(const Cfg *Func) const {
@@ -564,6 +602,17 @@
dumpSources(Func);
}
+void InstX8632Mfence::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 0);
+ Str << "\tmfence\n";
+}
+
+void InstX8632Mfence::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ Str << "mfence\n";
+}
+
void InstX8632Store::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
@@ -583,6 +632,26 @@
getSrc(0)->dump(Func);
}
+void InstX8632StoreQ::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 2);
+ assert(getSrc(1)->getType() == IceType_i64 ||
+ getSrc(1)->getType() == IceType_f64);
+ Str << "\tmovq\t";
+ getSrc(1)->emit(Func);
+ Str << ", ";
+ getSrc(0)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632StoreQ::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ Str << "storeq." << getSrc(0)->getType() << " ";
+ getSrc(1)->dump(Func);
+ Str << ", ";
+ getSrc(0)->dump(Func);
+}
+
void InstX8632Mov::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
@@ -611,6 +680,26 @@
dumpSources(Func);
}
+void InstX8632Movq::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 1);
+ assert(getDest()->getType() == IceType_i64 ||
+ getDest()->getType() == IceType_f64);
+ Str << "\tmovq\t";
+ getDest()->emit(Func);
+ Str << ", ";
+ getSrc(0)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Movq::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ Str << "movq." << getDest()->getType() << " ";
+ dumpDest(Func);
+ Str << ", ";
+ dumpSources(Func);
+}
+
void InstX8632Movsx::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
@@ -773,6 +862,29 @@
dumpSources(Func);
}
+void InstX8632Xadd::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ if (Locked) {
+ Str << "\tlock xadd ";
+ } else {
+ Str << "\txadd\t";
+ }
+ getSrc(0)->emit(Func);
+ Str << ", ";
+ getSrc(1)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Xadd::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ if (Locked) {
+ Str << "lock ";
+ }
+ Type Ty = getSrc(0)->getType();
+ Str << "xadd." << Ty << " ";
+ dumpSources(Func);
+}
+
void OperandX8632::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << "<OperandX8632>";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 7e6e199..54df869 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -151,7 +151,9 @@
Imul,
Label,
Load,
+ Mfence,
Mov,
+ Movq,
Movsx,
Movzx,
Mul,
@@ -167,11 +169,13 @@
Shr,
Shrd,
Store,
+ StoreQ,
Sub,
Subss,
Test,
Ucomiss,
UD2,
+ Xadd,
Xor
};
static const char *getWidthString(Type Ty);
@@ -578,6 +582,23 @@
virtual ~InstX8632Test() {}
};
+// Mfence instruction.
+class InstX8632Mfence : public InstX8632 {
+public:
+ static InstX8632Mfence *create(Cfg *Func) {
+ return new (Func->allocate<InstX8632Mfence>()) InstX8632Mfence(Func);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Mfence); }
+
+private:
+ InstX8632Mfence(Cfg *Func);
+ InstX8632Mfence(const InstX8632Mfence &) LLVM_DELETED_FUNCTION;
+ InstX8632Mfence &operator=(const InstX8632Mfence &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Mfence() {}
+};
+
// This is essentially a "mov" instruction with an OperandX8632Mem
// operand instead of Variable as the destination. It's important
// for liveness that there is no Dest operand.
@@ -617,6 +638,45 @@
virtual ~InstX8632Mov() {}
};
+// This is essentially a "movq" instruction with an OperandX8632Mem
+// operand instead of Variable as the destination. It's important
+// for liveness that there is no Dest operand.
+class InstX8632StoreQ : public InstX8632 {
+public:
+ static InstX8632StoreQ *create(Cfg *Func, Operand *Value, OperandX8632 *Mem) {
+ return new (Func->allocate<InstX8632StoreQ>())
+ InstX8632StoreQ(Func, Value, Mem);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, StoreQ); }
+
+private:
+ InstX8632StoreQ(Cfg *Func, Operand *Value, OperandX8632 *Mem);
+ InstX8632StoreQ(const InstX8632StoreQ &) LLVM_DELETED_FUNCTION;
+ InstX8632StoreQ &operator=(const InstX8632StoreQ &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632StoreQ() {}
+};
+
+// Movq - copy between XMM registers, or mem64 and XMM registers.
+class InstX8632Movq : public InstX8632 {
+public:
+ static InstX8632Movq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+ return new (Func->allocate<InstX8632Movq>())
+ InstX8632Movq(Func, Dest, Source);
+ }
+ virtual bool isRedundantAssign() const;
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Movq); }
+
+private:
+ InstX8632Movq(Cfg *Func, Variable *Dest, Operand *Source);
+ InstX8632Movq(const InstX8632Movq &) LLVM_DELETED_FUNCTION;
+ InstX8632Movq &operator=(const InstX8632Movq &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Movq() {}
+};
+
// Movsx - copy from a narrower integer type to a wider integer
// type, with sign extension.
class InstX8632Movsx : public InstX8632 {
@@ -744,6 +804,33 @@
virtual ~InstX8632Ret() {}
};
+// Exchanging Add instruction. Exchanges the first operand (destination
+// operand) with the second operand (source operand), then loads the sum
+// of the two values into the destination operand. The destination may be
+// a register or memory, while the source must be a register.
+//
+// Both the dest and source are updated. The caller should then insert a
+// FakeDef to reflect the second update.
+class InstX8632Xadd : public InstX8632 {
+public:
+ static InstX8632Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
+ bool Locked) {
+ return new (Func->allocate<InstX8632Xadd>())
+ InstX8632Xadd(Func, Dest, Source, Locked);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Xadd); }
+
+private:
+ bool Locked;
+
+ InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
+ InstX8632Xadd(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
+ InstX8632Xadd &operator=(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Xadd() {}
+};
+
} // end of namespace Ice
#endif // SUBZERO_SRC_ICEINSTX8632_H
diff --git a/src/IceIntrinsics.cpp b/src/IceIntrinsics.cpp
index dbf79cf..02562b5 100644
--- a/src/IceIntrinsics.cpp
+++ b/src/IceIntrinsics.cpp
@@ -82,7 +82,7 @@
{ \
{ \
{ Intrinsics::AtomicStore, true } \
- , { IceType_void, Overload, IceType_i32, IceType_i32 }, 5 \
+ , { IceType_void, Overload, IceType_i32, IceType_i32 }, 4 \
} \
, "nacl.atomic.store." NameSuffix \
}
@@ -199,4 +199,9 @@
return &it->second;
}
+bool Intrinsics::VerifyMemoryOrder(uint64_t Order) {
+ // There is only one memory ordering for atomics allowed right now.
+ return Order == Intrinsics::MemoryOrderSequentiallyConsistent;
+}
+
} // end of namespace Ice
diff --git a/src/IceIntrinsics.h b/src/IceIntrinsics.h
index 4f9f7de..3fbff44 100644
--- a/src/IceIntrinsics.h
+++ b/src/IceIntrinsics.h
@@ -54,6 +54,39 @@
Trap
};
+ /// Operations that can be represented by the AtomicRMW
+ /// intrinsic.
+ ///
+ /// Do not reorder these values: their order offers forward
+ /// compatibility of bitcode targeted to PNaCl.
+ enum AtomicRMWOperation {
+ AtomicInvalid = 0, // Invalid, keep first.
+ AtomicAdd,
+ AtomicSub,
+ AtomicOr,
+ AtomicAnd,
+ AtomicXor,
+ AtomicExchange,
+ AtomicNum // Invalid, keep last.
+ };
+
+ /// Memory orderings supported by PNaCl IR.
+ ///
+ /// Do not reorder these values: their order offers forward
+ /// compatibility of bitcode targeted to PNaCl.
+ enum MemoryOrder {
+ MemoryOrderInvalid = 0, // Invalid, keep first.
+ MemoryOrderRelaxed,
+ MemoryOrderConsume,
+ MemoryOrderAcquire,
+ MemoryOrderRelease,
+ MemoryOrderAcquireRelease,
+ MemoryOrderSequentiallyConsistent,
+ MemoryOrderNum // Invalid, keep last.
+ };
+
+ static bool VerifyMemoryOrder(uint64_t Order);
+
// Basic attributes related to each intrinsic, that are relevant to
// code generation. We will want to have more attributes (e.g., Setjmp
// returns twice and which affects stack coloring) once the lowering
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index af7b866..ef9bc22 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -431,7 +431,6 @@
InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
}
-// static
Type TargetX8632::stackSlotType() { return IceType_i32; }
void TargetX8632::addProlog(CfgNode *Node) {
@@ -1615,7 +1614,7 @@
Variable *Spill = Func->makeVariable(IceType_f64, Context.getNode());
Spill->setWeight(RegWeight::Zero);
Spill->setPreferredRegister(llvm::dyn_cast<Variable>(Src0RM), true);
- _mov(Spill, Src0RM);
+ _movq(Spill, Src0RM);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
@@ -1658,7 +1657,7 @@
_store(T_Lo, SpillLo);
_mov(T_Hi, hiOperand(Src0));
_store(T_Hi, SpillHi);
- _mov(Dest, Spill);
+ _movq(Dest, Spill);
} break;
}
break;
@@ -1800,16 +1799,140 @@
void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
switch (Instr->getIntrinsicInfo().ID) {
case Intrinsics::AtomicCmpxchg:
+ if (!Intrinsics::VerifyMemoryOrder(
+ llvm::cast<ConstantInteger>(Instr->getArg(3))->getValue())) {
+ Func->setError("Unexpected memory ordering (success) for AtomicCmpxchg");
+ return;
+ }
+ if (!Intrinsics::VerifyMemoryOrder(
+ llvm::cast<ConstantInteger>(Instr->getArg(4))->getValue())) {
+ Func->setError("Unexpected memory ordering (failure) for AtomicCmpxchg");
+ return;
+ }
+ // TODO(jvoung): fill it in.
+ Func->setError("Unhandled intrinsic");
+ return;
case Intrinsics::AtomicFence:
+ if (!Intrinsics::VerifyMemoryOrder(
+ llvm::cast<ConstantInteger>(Instr->getArg(0))->getValue())) {
+ Func->setError("Unexpected memory ordering for AtomicFence");
+ return;
+ }
+ _mfence();
+ return;
case Intrinsics::AtomicFenceAll:
- case Intrinsics::AtomicIsLockFree:
- case Intrinsics::AtomicLoad:
+  // NOTE: FenceAll should prevent any load/store from being moved
+ // across the fence (both atomic and non-atomic). The InstX8632Mfence
+ // instruction is currently marked coarsely as "HasSideEffects".
+ _mfence();
+ return;
+ case Intrinsics::AtomicIsLockFree: {
+ // X86 is always lock free for 8/16/32/64 bit accesses.
+ // TODO(jvoung): Since the result is constant when given a constant
+ // byte size, this opens up DCE opportunities.
+ Operand *ByteSize = Instr->getArg(0);
+ Variable *Dest = Instr->getDest();
+ if (ConstantInteger *CI = llvm::dyn_cast<ConstantInteger>(ByteSize)) {
+ Constant *Result;
+ switch (CI->getValue()) {
+ default:
+      // Some x86-64 processors support the cmpxchg16b instruction, which
+ // can make 16-byte operations lock free (when used with the LOCK
+ // prefix). However, that's not supported in 32-bit mode, so just
+ // return 0 even for large sizes.
+ Result = Ctx->getConstantZero(IceType_i32);
+ break;
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ Result = Ctx->getConstantInt(IceType_i32, 1);
+ break;
+ }
+ _mov(Dest, Result);
+ return;
+ }
+ // The PNaCl ABI requires the byte size to be a compile-time constant.
+ Func->setError("AtomicIsLockFree byte size should be compile-time const");
+ return;
+ }
+ case Intrinsics::AtomicLoad: {
+ // We require the memory address to be naturally aligned.
+ // Given that is the case, then normal loads are atomic.
+ if (!Intrinsics::VerifyMemoryOrder(
+ llvm::cast<ConstantInteger>(Instr->getArg(1))->getValue())) {
+ Func->setError("Unexpected memory ordering for AtomicLoad");
+ return;
+ }
+ Variable *Dest = Instr->getDest();
+ if (Dest->getType() == IceType_i64) {
+ // Follow what GCC does and use a movq instead of what lowerLoad()
+ // normally does (split the load into two).
+ // Thus, this skips load/arithmetic op folding. Load/arithmetic folding
+ // can't happen anyway, since this is x86-32 and integer arithmetic only
+ // happens on 32-bit quantities.
+ Variable *T = makeReg(IceType_f64);
+ OperandX8632Mem *Addr = FormMemoryOperand(Instr->getArg(0), IceType_f64);
+ _movq(T, Addr);
+ // Then cast the bits back out of the XMM register to the i64 Dest.
+ InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
+ lowerCast(Cast);
+ // Make sure that the atomic load isn't elided.
+ Context.insert(InstFakeUse::create(Func, Dest->getLo()));
+ Context.insert(InstFakeUse::create(Func, Dest->getHi()));
+ return;
+ }
+ InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
+ lowerLoad(Load);
+ // Make sure the atomic load isn't elided.
+ Context.insert(InstFakeUse::create(Func, Dest));
+ return;
+ }
case Intrinsics::AtomicRMW:
- case Intrinsics::AtomicStore:
+ if (!Intrinsics::VerifyMemoryOrder(
+ llvm::cast<ConstantInteger>(Instr->getArg(3))->getValue())) {
+ Func->setError("Unexpected memory ordering for AtomicRMW");
+ return;
+ }
+ lowerAtomicRMW(Instr->getDest(),
+ static_cast<uint32_t>(llvm::cast<ConstantInteger>(
+ Instr->getArg(0))->getValue()),
+ Instr->getArg(1), Instr->getArg(2));
+ return;
+ case Intrinsics::AtomicStore: {
+ if (!Intrinsics::VerifyMemoryOrder(
+ llvm::cast<ConstantInteger>(Instr->getArg(2))->getValue())) {
+ Func->setError("Unexpected memory ordering for AtomicStore");
+ return;
+ }
+ // We require the memory address to be naturally aligned.
+ // Given that is the case, then normal stores are atomic.
+ // Add a fence after the store to make it visible.
+ Operand *Value = Instr->getArg(0);
+ Operand *Ptr = Instr->getArg(1);
+ if (Value->getType() == IceType_i64) {
+ // Use a movq instead of what lowerStore() normally does
+ // (split the store into two), following what GCC does.
+ // Cast the bits from int -> to an xmm register first.
+ Variable *T = makeReg(IceType_f64);
+ InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
+ lowerCast(Cast);
+ // Then store XMM w/ a movq.
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, IceType_f64);
+ _storeq(T, Addr);
+ _mfence();
+ return;
+ }
+ InstStore *Store = InstStore::create(Func, Value, Ptr);
+ lowerStore(Store);
+ _mfence();
+ return;
+ }
case Intrinsics::Bswap:
case Intrinsics::Ctlz:
case Intrinsics::Ctpop:
case Intrinsics::Cttz:
+ // TODO(jvoung): fill it in.
Func->setError("Unhandled intrinsic");
return;
case Intrinsics::Longjmp: {
@@ -1817,7 +1940,7 @@
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
lowerCall(Call);
- break;
+ return;
}
case Intrinsics::Memcpy: {
// In the future, we could potentially emit an inline memcpy/memset, etc.
@@ -1827,7 +1950,7 @@
Call->addArg(Instr->getArg(1));
Call->addArg(Instr->getArg(2));
lowerCall(Call);
- break;
+ return;
}
case Intrinsics::Memmove: {
InstCall *Call = makeHelperCall("memmove", NULL, 3);
@@ -1835,7 +1958,7 @@
Call->addArg(Instr->getArg(1));
Call->addArg(Instr->getArg(2));
lowerCall(Call);
- break;
+ return;
}
case Intrinsics::Memset: {
// The value operand needs to be extended to a stack slot size
@@ -1849,32 +1972,33 @@
Call->addArg(ValExt);
Call->addArg(Instr->getArg(2));
lowerCall(Call);
- break;
+ return;
}
case Intrinsics::NaClReadTP: {
- Constant *Zero = Ctx->getConstantInt(IceType_i32, 0);
+ Constant *Zero = Ctx->getConstantZero(IceType_i32);
Operand *Src = OperandX8632Mem::create(Func, IceType_i32, NULL, Zero, NULL,
0, OperandX8632Mem::SegReg_GS);
Variable *Dest = Instr->getDest();
Variable *T = NULL;
_mov(T, Src);
_mov(Dest, T);
- break;
+ return;
}
case Intrinsics::Setjmp: {
InstCall *Call = makeHelperCall("setjmp", Instr->getDest(), 1);
Call->addArg(Instr->getArg(0));
lowerCall(Call);
- break;
+ return;
}
case Intrinsics::Sqrt:
case Intrinsics::Stacksave:
case Intrinsics::Stackrestore:
+ // TODO(jvoung): fill it in.
Func->setError("Unhandled intrinsic");
return;
case Intrinsics::Trap:
_ud2();
- break;
+ return;
case Intrinsics::UnknownIntrinsic:
Func->setError("Should not be lowering UnknownIntrinsic");
return;
@@ -1882,6 +2006,51 @@
return;
}
+void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
+ Operand *Ptr, Operand *Val) {
+ switch (Operation) {
+ default:
+ Func->setError("Unknown AtomicRMW operation");
+ return;
+ case Intrinsics::AtomicAdd: {
+ if (Dest->getType() == IceType_i64) {
+ // Do a nasty cmpxchg8b loop. Factor this into a function.
+ // TODO(jvoung): fill it in.
+ Func->setError("Unhandled AtomicRMW operation");
+ return;
+ }
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
+ const bool Locked = true;
+ Variable *T = NULL;
+ _mov(T, Val);
+ _xadd(Addr, T, Locked);
+ _mov(Dest, T);
+ return;
+ }
+ case Intrinsics::AtomicSub: {
+ if (Dest->getType() == IceType_i64) {
+ // Do a nasty cmpxchg8b loop.
+ // TODO(jvoung): fill it in.
+ Func->setError("Unhandled AtomicRMW operation");
+ return;
+ }
+ // Generate a memory operand from Ptr.
+ // neg...
+ // Then do the same as AtomicAdd.
+ // TODO(jvoung): fill it in.
+ Func->setError("Unhandled AtomicRMW operation");
+ return;
+ }
+ case Intrinsics::AtomicOr:
+ case Intrinsics::AtomicAnd:
+ case Intrinsics::AtomicXor:
+ case Intrinsics::AtomicExchange:
+ // TODO(jvoung): fill it in.
+ Func->setError("Unhandled AtomicRMW operation");
+ return;
+ }
+}
+
namespace {
bool isAdd(const Inst *Inst) {
@@ -2018,15 +2187,7 @@
// optimization already creates an OperandX8632Mem operand, so it
// doesn't need another level of transformation.
Type Ty = Inst->getDest()->getType();
- Operand *Src0 = Inst->getSourceAddress();
- // Address mode optimization already creates an OperandX8632Mem
- // operand, so it doesn't need another level of transformation.
- if (!llvm::isa<OperandX8632Mem>(Src0)) {
- Variable *Base = llvm::dyn_cast<Variable>(Src0);
- Constant *Offset = llvm::dyn_cast<Constant>(Src0);
- assert(Base || Offset);
- Src0 = OperandX8632Mem::create(Func, Ty, Base, Offset);
- }
+ Operand *Src0 = FormMemoryOperand(Inst->getSourceAddress(), Ty);
// Fuse this load with a subsequent Arithmetic instruction in the
// following situations:
@@ -2034,6 +2195,8 @@
// a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
//
// TODO: Clean up and test thoroughly.
+ // (E.g., if there is an mfence-all make sure the load ends up on the
+ // same side of the fence).
//
// TODO: Why limit to Arithmetic instructions? This could probably be
// applied to most any instruction type. Look at all source operands
@@ -2164,19 +2327,7 @@
void TargetX8632::lowerStore(const InstStore *Inst) {
Operand *Value = Inst->getData();
Operand *Addr = Inst->getAddr();
- OperandX8632Mem *NewAddr = llvm::dyn_cast<OperandX8632Mem>(Addr);
- // Address mode optimization already creates an OperandX8632Mem
- // operand, so it doesn't need another level of transformation.
- if (!NewAddr) {
- // The address will be either a constant (which represents a global
- // variable) or a variable, so either the Base or Offset component
- // of the OperandX8632Mem will be set.
- Variable *Base = llvm::dyn_cast<Variable>(Addr);
- Constant *Offset = llvm::dyn_cast<Constant>(Addr);
- assert(Base || Offset);
- NewAddr = OperandX8632Mem::create(Func, Value->getType(), Base, Offset);
- }
- NewAddr = llvm::cast<OperandX8632Mem>(legalize(NewAddr));
+ OperandX8632Mem *NewAddr = FormMemoryOperand(Addr, Value->getType());
if (NewAddr->getType() == IceType_i64) {
Value = legalize(Value);
@@ -2294,10 +2445,11 @@
// need to go in uninitialized registers.
From = Ctx->getConstantZero(From->getType());
}
- bool NeedsReg = !(Allowed & Legal_Imm) ||
+ bool NeedsReg =
+ !(Allowed & Legal_Imm) ||
// ConstantFloat and ConstantDouble are actually memory operands.
- (!(Allowed & Legal_Mem) && (From->getType() == IceType_f32 ||
- From->getType() == IceType_f64));
+ (!(Allowed & Legal_Mem) &&
+ (From->getType() == IceType_f32 || From->getType() == IceType_f64));
if (NeedsReg) {
Variable *Reg = makeReg(From->getType(), RegNum);
_mov(Reg, From);
@@ -2330,6 +2482,20 @@
return llvm::cast<Variable>(legalize(From, Legal_Reg, AllowOverlap, RegNum));
}
+OperandX8632Mem *TargetX8632::FormMemoryOperand(Operand *Operand, Type Ty) {
+ OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
+ // It may be the case that address mode optimization already creates
+ // an OperandX8632Mem, so in that case it wouldn't need another level
+ // of transformation.
+ if (!Mem) {
+ Variable *Base = llvm::dyn_cast<Variable>(Operand);
+ Constant *Offset = llvm::dyn_cast<Constant>(Operand);
+ assert(Base || Offset);
+ Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
+ }
+ return llvm::cast<OperandX8632Mem>(legalize(Mem));
+}
+
Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
// There aren't any 64-bit integer registers for x86-32.
assert(Type != IceType_i64);
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 7902136..972b29f 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -94,6 +94,9 @@
virtual void doAddressOptLoad();
virtual void doAddressOptStore();
+ void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
+ Operand *Val);
+
// Operand legalization helpers. To deal with address mode
// constraints, the helpers will create a new Operand and emit
// instructions that guarantee that the Operand kind is one of those
@@ -114,6 +117,10 @@
int32_t RegNum = Variable::NoRegister);
Variable *legalizeToVar(Operand *From, bool AllowOverlap = false,
int32_t RegNum = Variable::NoRegister);
+ // Turn a pointer operand into a memory operand that can be
+ // used by a real load/store operation. Legalizes the operand as well.
+ // This is a nop if the operand is already a legal memory operand.
+ OperandX8632Mem *FormMemoryOperand(Operand *Ptr, Type Ty);
Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
InstCall *makeHelperCall(const IceString &Name, Variable *Dest,
@@ -180,6 +187,7 @@
void _imul(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Imul::create(Func, Dest, Src0));
}
+ void _mfence() { Context.insert(InstX8632Mfence::create(Func)); }
// If Dest=NULL is passed in, then a new variable is created, marked
// as infinite register allocation weight, and returned through the
// in/out Dest argument.
@@ -191,6 +199,9 @@
Context.insert(InstX8632Mov::create(Func, Dest, Src0));
}
}
+ void _movq(Variable *Dest, Operand *Src0) {
+ Context.insert(InstX8632Movq::create(Func, Dest, Src0));
+ }
void _movsx(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Movsx::create(Func, Dest, Src0));
}
@@ -236,6 +247,9 @@
void _store(Operand *Value, OperandX8632 *Mem) {
Context.insert(InstX8632Store::create(Func, Value, Mem));
}
+ void _storeq(Operand *Value, OperandX8632 *Mem) {
+ Context.insert(InstX8632StoreQ::create(Func, Value, Mem));
+ }
void _sub(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Sub::create(Func, Dest, Src0));
}
@@ -249,6 +263,12 @@
Context.insert(InstX8632Ucomiss::create(Func, Src0, Src1));
}
void _ud2() { Context.insert(InstX8632UD2::create(Func)); }
+ void _xadd(Operand *Dest, Variable *Src, bool Locked) {
+ Context.insert(InstX8632Xadd::create(Func, Dest, Src, Locked));
+ // The xadd exchanges Dest and Src (modifying Src).
+ // Model that update with a FakeDef.
+ Context.insert(InstFakeDef::create(Func, Src));
+ }
void _xor(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Xor::create(Func, Dest, Src0));
}
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index 2b323f7..c3a98df 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -599,8 +599,6 @@
}
}
if (Call->getNumArgs() + 1 != I->NumTypes) {
- std::cerr << "Call->getNumArgs() " << (int)Call->getNumArgs()
- << " I->NumTypes " << (int)I->NumTypes << "\n";
report_fatal_error("Mismatched # of args.");
}
for (size_t i = 1; i < I->NumTypes; ++i) {
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-errors.ll b/tests_lit/llvm2ice_tests/nacl-atomic-errors.ll
new file mode 100644
index 0000000..581b73c
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-errors.ll
@@ -0,0 +1,169 @@
+; Test that some errors trigger when the usage of NaCl atomic
+; intrinsics does not match the required ABI.
+
+; RUN: not %llvm2ice --verbose none %s 2>&1 | FileCheck %s
+
+declare i8 @llvm.nacl.atomic.load.i8(i8*, i32)
+declare i16 @llvm.nacl.atomic.load.i16(i16*, i32)
+declare i64 @llvm.nacl.atomic.load.i64(i64*, i32)
+declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
+declare void @llvm.nacl.atomic.store.i64(i64, i64*, i32)
+declare i8 @llvm.nacl.atomic.rmw.i8(i32, i8*, i8, i32)
+declare i16 @llvm.nacl.atomic.rmw.i16(i32, i16*, i16, i32)
+declare i32 @llvm.nacl.atomic.rmw.i32(i32, i32*, i32, i32)
+declare i64 @llvm.nacl.atomic.rmw.i64(i32, i64*, i64, i32)
+declare i32 @llvm.nacl.atomic.cmpxchg.i32(i32*, i32, i32, i32, i32)
+declare i64 @llvm.nacl.atomic.cmpxchg.i64(i64*, i64, i64, i32, i32)
+declare void @llvm.nacl.atomic.fence(i32)
+declare i1 @llvm.nacl.atomic.is.lock.free(i32, i8*)
+
+;;; Load
+;;; Check unexpected memory order parameter (only sequential
+;;; consistency == 6 is currently allowed).
+
+define i32 @error_atomic_load_8(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i8*
+ %i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 0)
+ %r = zext i8 %i to i32
+ ret i32 %r
+}
+; CHECK: Unexpected memory ordering for AtomicLoad
+
+define i32 @error_atomic_load_16(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i16*
+ %i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 1)
+ %r = zext i16 %i to i32
+ ret i32 %r
+}
+; CHECK: Unexpected memory ordering for AtomicLoad
+
+define i64 @error_atomic_load_64(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %r = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr, i32 2)
+ ret i64 %r
+}
+; CHECK: Unexpected memory ordering for AtomicLoad
+
+
+;;; Store
+
+define void @error_atomic_store_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ call void @llvm.nacl.atomic.store.i32(i32 %v, i32* %ptr, i32 2)
+ ret void
+}
+; CHECK: Unexpected memory ordering for AtomicStore
+
+define void @error_atomic_store_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ call void @llvm.nacl.atomic.store.i64(i64 %v, i64* %ptr, i32 3)
+ ret void
+}
+; CHECK: Unexpected memory ordering for AtomicStore
+
+define void @error_atomic_store_64_const(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ call void @llvm.nacl.atomic.store.i64(i64 12345678901234, i64* %ptr, i32 4)
+ ret void
+}
+; CHECK: Unexpected memory ordering for AtomicStore
+
+;;; RMW
+;;; Test atomic memory order and operation.
+
+define i32 @error_atomic_rmw_add_8(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 1, i8* %ptr, i8 %trunc, i32 5)
+ %a_ext = zext i8 %a to i32
+ ret i32 %a_ext
+}
+; CHECK: Unexpected memory ordering for AtomicRMW
+
+define i64 @error_atomic_rmw_add_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 4)
+ ret i64 %a
+}
+; CHECK: Unexpected memory ordering for AtomicRMW
+
+define i32 @error_atomic_rmw_add_16(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 0, i16* %ptr, i16 %trunc, i32 6)
+ %a_ext = zext i16 %a to i32
+ ret i32 %a_ext
+}
+; CHECK: Unknown AtomicRMW operation
+
+define i32 @error_atomic_rmw_add_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 7, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK: Unknown AtomicRMW operation
+
+define i32 @error_atomic_rmw_add_32_max(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 4294967295, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK: Unknown AtomicRMW operation
+
+;;; Cmpxchg
+
+define i32 @error_atomic_cmpxchg_32_success(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
+ i32 %desired, i32 0, i32 6)
+ ret i32 %old
+}
+; CHECK: Unexpected memory ordering (success) for AtomicCmpxchg
+
+define i32 @error_atomic_cmpxchg_32_failure(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
+ i32 %desired, i32 6, i32 0)
+ ret i32 %old
+}
+; CHECK: Unexpected memory ordering (failure) for AtomicCmpxchg
+
+define i64 @error_atomic_cmpxchg_64_failure(i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+ i64 %desired, i32 6, i32 3)
+ ret i64 %old
+}
+; CHECK: Unexpected memory ordering (failure) for AtomicCmpxchg
+
+;;; Fence and is-lock-free.
+
+define void @error_atomic_fence() {
+entry:
+ call void @llvm.nacl.atomic.fence(i32 1)
+ ret void
+}
+; CHECK: Unexpected memory ordering for AtomicFence
+
+define i32 @error_atomic_is_lock_free_var(i32 %iptr, i32 %bs) {
+entry:
+ %ptr = inttoptr i32 %iptr to i8*
+ %i = call i1 @llvm.nacl.atomic.is.lock.free(i32 %bs, i8* %ptr)
+ %r = zext i1 %i to i32
+ ret i32 %r
+}
+; CHECK: AtomicIsLockFree byte size should be compile-time const
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll b/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
new file mode 100644
index 0000000..32c5e85
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
@@ -0,0 +1,216 @@
+; Test that loads/stores don't move across a nacl.atomic.fence.all.
+; This should apply to both atomic and non-atomic loads/stores
+; (unlike the non-"all" variety of nacl.atomic.fence, which only
+; applies to atomic load/stores).
+;
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+
+declare void @llvm.nacl.atomic.fence.all()
+declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
+declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
+
+@g32_a = internal global [4 x i8] zeroinitializer, align 4
+@g32_b = internal global [4 x i8] zeroinitializer, align 4
+@g32_c = internal global [4 x i8] zeroinitializer, align 4
+@g32_d = internal global [4 x i8] c"\02\00\00\00", align 4
+
+define i32 @test_fused_load_add_a() {
+entry:
+ %p_alloca = alloca i8, i32 4, align 4
+ %p_alloca_bc = bitcast i8* %p_alloca to i32*
+ store i32 999, i32* %p_alloca_bc, align 1
+
+ %p_a = bitcast [4 x i8]* @g32_a to i32*
+ %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
+ %l_a2 = add i32 %l_a, 1
+ call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
+
+ %p_b = bitcast [4 x i8]* @g32_b to i32*
+ %l_b = load i32* %p_b
+ %l_b2 = add i32 %l_b, 1
+ store i32 %l_b2, i32* %p_b, align 1
+
+ %p_c = bitcast [4 x i8]* @g32_c to i32*
+ %l_c = load i32* %p_c
+ %l_c2 = add i32 %l_c, 1
+ call void @llvm.nacl.atomic.fence.all()
+ store i32 %l_c2, i32* %p_c, align 1
+
+ ret i32 %l_c2
+}
+; CHECK-LABEL: test_fused_load_add_a
+; alloca store
+; CHECK: mov {{.*}}, esp
+; CHECK: mov dword ptr {{.*}}, 999
+; atomic store (w/ its own mfence)
+; CHECK: mov {{.*}}, g32_a
+; The load + add are optimized into one everywhere.
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mov dword ptr
+; CHECK: mfence
+; CHECK: mov {{.*}}, g32_b
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mov dword ptr
+; CHECK: mov {{.*}}, g32_c
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mfence
+; CHECK: mov dword ptr
+
+; Test with the fence moved up a bit.
+define i32 @test_fused_load_add_b() {
+entry:
+ %p_alloca = alloca i8, i32 4, align 4
+ %p_alloca_bc = bitcast i8* %p_alloca to i32*
+ store i32 999, i32* %p_alloca_bc, align 1
+
+ %p_a = bitcast [4 x i8]* @g32_a to i32*
+ %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
+ %l_a2 = add i32 %l_a, 1
+ call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
+
+ %p_b = bitcast [4 x i8]* @g32_b to i32*
+ %l_b = load i32* %p_b
+ %l_b2 = add i32 %l_b, 1
+ store i32 %l_b2, i32* %p_b, align 1
+
+ %p_c = bitcast [4 x i8]* @g32_c to i32*
+ call void @llvm.nacl.atomic.fence.all()
+ %l_c = load i32* %p_c
+ %l_c2 = add i32 %l_c, 1
+ store i32 %l_c2, i32* %p_c, align 1
+
+ ret i32 %l_c2
+}
+; CHECK-LABEL: test_fused_load_add_b
+; alloca store
+; CHECK: mov {{.*}}, esp
+; CHECK: mov dword ptr {{.*}}, 999
+; atomic store (w/ its own mfence)
+; CHECK: mov {{.*}}, g32_a
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mov dword ptr
+; CHECK: mfence
+; CHECK: mov {{.*}}, g32_b
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mov dword ptr
+; CHECK: mov {{.*}}, g32_c
+; CHECK: mfence
+; Load + add can still be optimized into one instruction
+; because it is not separated by a fence.
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mov dword ptr
+
+; Test with the fence splitting a load/add.
+define i32 @test_fused_load_add_c() {
+entry:
+ %p_alloca = alloca i8, i32 4, align 4
+ %p_alloca_bc = bitcast i8* %p_alloca to i32*
+ store i32 999, i32* %p_alloca_bc, align 1
+
+ %p_a = bitcast [4 x i8]* @g32_a to i32*
+ %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
+ %l_a2 = add i32 %l_a, 1
+ call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
+
+ %p_b = bitcast [4 x i8]* @g32_b to i32*
+ %l_b = load i32* %p_b
+ call void @llvm.nacl.atomic.fence.all()
+ %l_b2 = add i32 %l_b, 1
+ store i32 %l_b2, i32* %p_b, align 1
+
+ %p_c = bitcast [4 x i8]* @g32_c to i32*
+ %l_c = load i32* %p_c
+ %l_c2 = add i32 %l_c, 1
+ store i32 %l_c2, i32* %p_c, align 1
+
+ ret i32 %l_c2
+}
+; CHECK-LABEL: test_fused_load_add_c
+; alloca store
+; CHECK: mov {{.*}}, esp
+; CHECK: mov dword ptr {{.*}}, 999
+; atomic store (w/ its own mfence)
+; CHECK: mov {{.*}}, g32_a
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mov dword ptr
+; CHECK: mfence
+; CHECK: mov {{.*}}, g32_b
+; This load + add is no longer optimized into one,
+; though perhaps it should be legal as long as
+; the load stays on the same side of the fence.
+; CHECK: mov {{.*}}, dword ptr
+; CHECK: mfence
+; CHECK: add {{.*}}, 1
+; CHECK: mov dword ptr
+; CHECK: mov {{.*}}, g32_c
+; CHECK: add {{.*}}, dword ptr
+; CHECK: mov dword ptr
+
+
+; Test where a bunch of i8 loads could have been fused into one
+; i32 load, but a fence blocks that.
+define i32 @could_have_fused_loads() {
+entry:
+ %ptr1 = bitcast [4 x i8]* @g32_d to i8*
+ %b1 = load i8* %ptr1
+
+ %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32
+ %int_ptr_bump2 = add i32 %int_ptr2, 1
+ %ptr2 = inttoptr i32 %int_ptr_bump2 to i8*
+ %b2 = load i8* %ptr2
+
+ %int_ptr_bump3 = add i32 %int_ptr2, 2
+ %ptr3 = inttoptr i32 %int_ptr_bump3 to i8*
+ %b3 = load i8* %ptr3
+
+ call void @llvm.nacl.atomic.fence.all()
+
+ %int_ptr_bump4 = add i32 %int_ptr2, 3
+ %ptr4 = inttoptr i32 %int_ptr_bump4 to i8*
+ %b4 = load i8* %ptr4
+
+ %b1.ext = zext i8 %b1 to i32
+ %b2.ext = zext i8 %b2 to i32
+ %b2.shift = shl i32 %b2.ext, 8
+ %b12 = or i32 %b1.ext, %b2.shift
+ %b3.ext = zext i8 %b3 to i32
+ %b3.shift = shl i32 %b3.ext, 16
+ %b123 = or i32 %b12, %b3.shift
+ %b4.ext = zext i8 %b4 to i32
+ %b4.shift = shl i32 %b4.ext, 24
+ %b1234 = or i32 %b123, %b4.shift
+ ret i32 %b1234
+}
+; CHECK-LABEL: could_have_fused_loads
+; CHECK: mov {{.*}}, g32_d
+; CHECK: mov {{.*}}, byte ptr
+; CHECK: mov {{.*}}, byte ptr
+; CHECK: mov {{.*}}, byte ptr
+; CHECK: mfence
+; CHECK: mov {{.*}}, byte ptr
+
+
+; Test where an identical load from two branches could have been hoisted
+; up, and then the code merged, but a fence prevents it.
+define i32 @could_have_hoisted_loads(i32 %x) {
+entry:
+ %ptr = bitcast [4 x i8]* @g32_d to i32*
+ %cmp = icmp eq i32 %x, 1
+ br i1 %cmp, label %branch1, label %branch2
+branch1:
+ %y = load i32* %ptr
+ ret i32 %y
+branch2:
+ call void @llvm.nacl.atomic.fence.all()
+ %z = load i32* %ptr
+ ret i32 %z
+}
+; CHECK-LABEL: could_have_hoisted_loads
+; CHECK: mov {{.*}}, g32_d
+; CHECK: je {{.*}}
+; CHECK: jmp {{.*}}
+; CHECK: mov {{.*}}, dword ptr
+; CHECK: ret
+; CHECK: mfence
+; CHECK: mov {{.*}}, dword ptr
+; CHECK: ret
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
new file mode 100644
index 0000000..8dfcc61
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -0,0 +1,409 @@
+; This tests each of the supported NaCl atomic instructions for every
+; size allowed.
+
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN: | FileCheck --check-prefix=DUMP %s
+
+declare i8 @llvm.nacl.atomic.load.i8(i8*, i32)
+declare i16 @llvm.nacl.atomic.load.i16(i16*, i32)
+declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
+declare i64 @llvm.nacl.atomic.load.i64(i64*, i32)
+declare void @llvm.nacl.atomic.store.i8(i8, i8*, i32)
+declare void @llvm.nacl.atomic.store.i16(i16, i16*, i32)
+declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
+declare void @llvm.nacl.atomic.store.i64(i64, i64*, i32)
+declare i8 @llvm.nacl.atomic.rmw.i8(i32, i8*, i8, i32)
+declare i16 @llvm.nacl.atomic.rmw.i16(i32, i16*, i16, i32)
+declare i32 @llvm.nacl.atomic.rmw.i32(i32, i32*, i32, i32)
+declare i64 @llvm.nacl.atomic.rmw.i64(i32, i64*, i64, i32)
+declare i8 @llvm.nacl.atomic.cmpxchg.i8(i8*, i8, i8, i32, i32)
+declare i16 @llvm.nacl.atomic.cmpxchg.i16(i16*, i16, i16, i32, i32)
+declare i32 @llvm.nacl.atomic.cmpxchg.i32(i32*, i32, i32, i32, i32)
+declare i64 @llvm.nacl.atomic.cmpxchg.i64(i64*, i64, i64, i32, i32)
+declare void @llvm.nacl.atomic.fence(i32)
+declare void @llvm.nacl.atomic.fence.all()
+declare i1 @llvm.nacl.atomic.is.lock.free(i32, i8*)
+
+;;; Load
+
+; x86 guarantees load/store to be atomic if naturally aligned.
+; The PNaCl IR requires all atomic accesses to be naturally aligned.
+
+define i32 @test_atomic_load_8(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i8*
+ ; parameter value "6" is for the sequential consistency memory order.
+ %i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6)
+ %r = zext i8 %i to i32
+ ret i32 %r
+}
+; CHECK-LABEL: test_atomic_load_8
+; CHECK: mov {{.*}}, dword
+; CHECK: mov {{.*}}, byte
+
+define i32 @test_atomic_load_16(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i16*
+ %i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6)
+ %r = zext i16 %i to i32
+ ret i32 %r
+}
+; CHECK-LABEL: test_atomic_load_16
+; CHECK: mov {{.*}}, dword
+; CHECK: mov {{.*}}, word
+
+define i32 @test_atomic_load_32(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ ret i32 %r
+}
+; CHECK-LABEL: test_atomic_load_32
+; CHECK: mov {{.*}}, dword
+; CHECK: mov {{.*}}, dword
+
+define i64 @test_atomic_load_64(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %r = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr, i32 6)
+ ret i64 %r
+}
+; CHECK-LABEL: test_atomic_load_64
+; CHECK: movq x{{.*}}, qword
+; CHECK: movq qword {{.*}}, x{{.*}}
+
+define i32 @test_atomic_load_32_with_arith(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ %r2 = add i32 %r, 32
+ ret i32 %r2
+}
+; CHECK-LABEL: test_atomic_load_32_with_arith
+; CHECK: mov {{.*}}, dword
+; The next instruction may be a separate load or folded into an add.
+
+define i32 @test_atomic_load_32_ignored(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ ret i32 0
+}
+; CHECK-LABEL: test_atomic_load_32_ignored
+; CHECK: mov {{.*}}, dword
+; CHECK: mov {{.*}}, dword
+
+define i64 @test_atomic_load_64_ignored(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %ignored = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr, i32 6)
+ ret i64 0
+}
+; CHECK-LABEL: test_atomic_load_64_ignored
+; CHECK: movq x{{.*}}, qword
+; CHECK: movq qword {{.*}}, x{{.*}}
+
+
+;;; Store
+
+define void @test_atomic_store_8(i32 %iptr, i32 %v) {
+entry:
+ %truncv = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ call void @llvm.nacl.atomic.store.i8(i8 %truncv, i8* %ptr, i32 6)
+ ret void
+}
+; CHECK-LABEL: test_atomic_store_8
+; CHECK: mov byte
+; CHECK: mfence
+
+define void @test_atomic_store_16(i32 %iptr, i32 %v) {
+entry:
+ %truncv = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ call void @llvm.nacl.atomic.store.i16(i16 %truncv, i16* %ptr, i32 6)
+ ret void
+}
+; CHECK-LABEL: test_atomic_store_16
+; CHECK: mov word
+; CHECK: mfence
+
+define void @test_atomic_store_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ call void @llvm.nacl.atomic.store.i32(i32 %v, i32* %ptr, i32 6)
+ ret void
+}
+; CHECK-LABEL: test_atomic_store_32
+; CHECK: mov dword
+; CHECK: mfence
+
+define void @test_atomic_store_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ call void @llvm.nacl.atomic.store.i64(i64 %v, i64* %ptr, i32 6)
+ ret void
+}
+; CHECK-LABEL: test_atomic_store_64
+; CHECK: movq x{{.*}}, qword
+; CHECK: movq qword {{.*}}, x{{.*}}
+; CHECK: mfence
+
+define void @test_atomic_store_64_const(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ call void @llvm.nacl.atomic.store.i64(i64 12345678901234, i64* %ptr, i32 6)
+ ret void
+}
+; CHECK-LABEL: test_atomic_store_64_const
+; CHECK: mov {{.*}}, 1942892530
+; CHECK: mov {{.*}}, 2874
+; CHECK: movq x{{.*}}, qword
+; CHECK: movq qword {{.*}}, x{{.*}}
+; CHECK: mfence
+
+
+;;; RMW
+
+define i32 @test_atomic_rmw_add_8(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ ; "1" is an atomic add, and "6" is sequential consistency.
+ %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 1, i8* %ptr, i8 %trunc, i32 6)
+ %a_ext = zext i8 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_add_8
+; CHECK: lock xadd byte {{.*}}, [[REG:.*]]
+; CHECK: mov {{.*}}, {{.*}}[[REG]]
+
+define i32 @test_atomic_rmw_add_16(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 1, i16* %ptr, i16 %trunc, i32 6)
+ %a_ext = zext i16 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_add_16
+; CHECK: lock xadd word {{.*}}, [[REG:.*]]
+; CHECK: mov {{.*}}, {{.*}}[[REG]]
+
+define i32 @test_atomic_rmw_add_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_add_32
+; CHECK: lock xadd dword {{.*}}, [[REG:.*]]
+; CHECK: mov {{.*}}, {{.*}}[[REG]]
+
+;define i64 @test_atomic_rmw_add_64(i32 %iptr, i64 %v) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i64*
+; %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
+; ret i64 %a
+;}
+; CHECKLATER-LABEL: test_atomic_rmw_add_64
+; CHECKLATER: uh need a... cmpxchg8b loop.
+
+define i32 @test_atomic_rmw_add_32_ignored(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %v, i32 6)
+ ret i32 %v
+}
+; CHECK-LABEL: test_atomic_rmw_add_32_ignored
+; CHECK: lock xadd dword {{.*}}, [[REG:.*]]
+
+;define i32 @test_atomic_rmw_sub_32(i32 %iptr, i32 %v) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i32*
+; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %v, i32 6)
+; ret i32 %a
+;}
+; CHECKLATER-LABEL: test_atomic_rmw_sub_32
+; CHECKLATER: neg
+; CHECKLATER: lock
+; CHECKLATER: xadd
+
+;define i32 @test_atomic_rmw_or_32(i32 %iptr, i32 %v) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i32*
+; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %v, i32 6)
+; ret i32 %a
+;}
+; CHECKLATER-LABEL: test_atomic_rmw_or_32
+; Need a cmpxchg loop.
+
+;define i32 @test_atomic_rmw_and_32(i32 %iptr, i32 %v) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i32*
+; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %v, i32 6)
+; ret i32 %a
+;}
+; CHECKLATER-LABEL: test_atomic_rmw_and_32
+; Also a cmpxchg loop.
+
+;define i32 @test_atomic_rmw_xor_32(i32 %iptr, i32 %v) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i32*
+; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %v, i32 6)
+; ret i32 %a
+;}
+; CHECKLATER-LABEL: test_atomic_rmw_xor_32
+; Also a cmpxchg loop.
+
+;define i32 @test_atomic_rmw_xchg_32(i32 %iptr, i32 %v) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i32*
+; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %v, i32 6)
+; ret i32 %a
+;}
+; CHECKLATER-LABEL: test_atomic_rmw_xchg_32
+
+;;;; Cmpxchg
+
+;define i32 @test_atomic_cmpxchg_8(i32 %iptr, i32 %expected, i32 %desired) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i8*
+; %trunc_exp = trunc i32 %expected to i8
+; %trunc_des = trunc i32 %desired to i8
+; %old = call i8 @llvm.nacl.atomic.cmpxchg.i8(i8* %ptr, i8 %trunc_exp,
+; i8 %trunc_des, i32 6, i32 6)
+; %old_ext = zext i8 %old to i32
+; ret i32 %old_ext
+;}
+; CHECKLATER-LABEL: test_atomic_cmpxchg_8
+; CHECKLATER: lock cmpxchg byte
+
+;define i32 @test_atomic_cmpxchg_16(i32 %iptr, i32 %expected, i32 %desired) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i16*
+; %trunc_exp = trunc i32 %expected to i16
+; %trunc_des = trunc i32 %desired to i16
+; %old = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %trunc_exp,
+; i16 %trunc_des, i32 6, i32 6)
+; %old_ext = zext i16 %old to i32
+; ret i32 %old_ext
+;}
+; CHECKLATER-LABEL: test_atomic_cmpxchg_16
+; This one is a bit gross for NaCl right now.
+; https://code.google.com/p/nativeclient/issues/detail?id=2981
+; But we'll assume that NaCl will have it fixed...
+; CHECKLATER: lock cmpxchg word
+
+;define i32 @test_atomic_cmpxchg_32(i32 %iptr, i32 %expected, i32 %desired) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i32*
+; %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
+; i32 %desired, i32 6, i32 6)
+; ret i32 %old
+;}
+; CHECKLATER-LABEL: test_atomic_cmpxchg_32
+; CHECKLATER: mov eax
+; CHECKLATER: mov ecx
+; CHECKLATER: lock cmpxchg dword
+
+;define i64 @test_atomic_cmpxchg_64(i32 %iptr, i64 %expected, i64 %desired) {
+;entry:
+; %ptr = inttoptr i32 %iptr to i64*
+; %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+; i64 %desired, i32 6, i32 6)
+; ret i64 %old
+;}
+; CHECKLATER-LABEL: test_atomic_cmpxchg_64
+; CHECKLATER: mov eax
+; CHECKLATER: mov edx
+; CHECKLATER: mov ebx
+; CHECKLATER: mov ecx
+; CHECKLATER: lock cmpxchg8b qword
+
+;define i32 @test_atomic_cmpxchg_32_loop(i32 %iptr,
+; i32 %expected, i32 %desired) {
+;entry:
+; br label %loop
+;
+;loop:
+; %cmp = phi i32 [ %expected, %entry], [%old, %loop]
+; %ptr = inttoptr i32 %iptr to i32*
+; %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %cmp,
+; i32 %desired, i32 6, i32 6)
+; %success = icmp eq i32 %cmp, %old
+; br i1 %success, label %done, label %loop
+;
+;done:
+; ret i32 %old
+;}
+; CHECKLATER-LABEL: test_atomic_cmpxchg_32_loop
+
+;;;; Fence and is-lock-free.
+
+define void @test_atomic_fence() {
+entry:
+ call void @llvm.nacl.atomic.fence(i32 6)
+ ret void
+}
+; CHECK-LABEL: test_atomic_fence
+; CHECK: mfence
+
+define void @test_atomic_fence_all() {
+entry:
+ call void @llvm.nacl.atomic.fence.all()
+ ret void
+}
+; CHECK-LABEL: test_atomic_fence_all
+; CHECK: mfence
+
+define i32 @test_atomic_is_lock_free(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i8*
+ %i = call i1 @llvm.nacl.atomic.is.lock.free(i32 4, i8* %ptr)
+ %r = zext i1 %i to i32
+ ret i32 %r
+}
+; CHECK-LABEL: test_atomic_is_lock_free
+; CHECK: mov {{.*}}, 1
+
+define i32 @test_not_lock_free(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i8*
+ %i = call i1 @llvm.nacl.atomic.is.lock.free(i32 7, i8* %ptr)
+ %r = zext i1 %i to i32
+ ret i32 %r
+}
+; CHECK-LABEL: test_not_lock_free
+; CHECK: mov {{.*}}, 0
+
+; TODO(jvoung): at some point we can take advantage of the
+; fact that nacl.atomic.is.lock.free will resolve to a constant
+; (which adds DCE opportunities). Once we optimize, the test expectations
+; for this case should change.
+define i32 @test_atomic_is_lock_free_can_dce(i32 %iptr, i32 %x, i32 %y) {
+entry:
+ %ptr = inttoptr i32 %iptr to i8*
+ %i = call i1 @llvm.nacl.atomic.is.lock.free(i32 4, i8* %ptr)
+ %i_ext = zext i1 %i to i32
+ %cmp = icmp eq i32 %i_ext, 1
+ br i1 %cmp, label %lock_free, label %not_lock_free
+lock_free:
+ ret i32 %i_ext
+
+not_lock_free:
+ %z = add i32 %x, %y
+ ret i32 %z
+}
+; CHECK-LABEL: test_atomic_is_lock_free_can_dce
+; CHECK: mov {{.*}}, 1
+; CHECK: ret
+; CHECK: add
+; CHECK: ret
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ