Subzero. ARM32. Refactors atomic intrinsics lowering.
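
The atomic RMW, atomic store, and cmpxchg lowerings now share a single
load-linked/store-exclusive loop emitter, lowerLoadLinkedStoreExclusive.
A sketch of a typical caller (simplified from the IceTargetLoweringARM32.cpp
changes below): the per-iteration work goes in a callback that returns the
register to pass to strex.

    _dmb();
    lowerLoadLinkedStoreExclusive(
        DestTy, Addr, [this, ResultR, ContentsR, ValRF](Variable *Tmp) {
          // Copy the loaded value, apply the RMW arithmetic, and return the
          // register to be store-exclusive'd back to [Addr].
          lowerAssign(InstAssign::create(Func, ContentsR, Tmp));
          lowerArithmetic(
              createArithInst(Func, Operation, ResultR, ContentsR, ValRF));
          return ResultR;
        });
    _dmb();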

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1409863006 .
diff --git a/crosstest/test_sync_atomic.cpp b/crosstest/test_sync_atomic.cpp
index d1578eb..432002a 100644
--- a/crosstest/test_sync_atomic.cpp
+++ b/crosstest/test_sync_atomic.cpp
@@ -47,7 +47,8 @@
     if (fetch) {                                                               \
       return __sync_fetch_and_##inst(ptr, 42);                                 \
     } else {                                                                   \
-      return __sync_##inst##_and_fetch(ptr, 99);                               \
+      const type value = static_cast<type>(0xaaaaaaaaaaaaaaaaull);             \
+      return __sync_##inst##_and_fetch(ptr, value);                            \
     }                                                                          \
   }
 
diff --git a/pydir/build-runtime.py b/pydir/build-runtime.py
index 2d5cf3d..c4ba6d4 100755
--- a/pydir/build-runtime.py
+++ b/pydir/build-runtime.py
@@ -65,14 +65,27 @@
               '-target=' + target_info.triple,
               '-c',
               '{srcdir}/szrt_profiler.c'.format(srcdir=srcdir),
-              '-o', TmpFile('{dir}/szrt_profiler_native_{target}.o')
+              '-o', TmpFile('{dir}/szrt_native_profiler_{target}.o')
+      ], echo=verbose)
+    # Assemble srcdir/szrt_asm_{target}.s to tempdir/szrt_native_asm_{target}.o.
+    shellcmd(['llvm-mc',
+              '-triple=' + target_info.triple, '--defsym NATIVE=1',
+              '-filetype=obj',
+              '-o', TmpFile('{dir}/szrt_native_asm_{target}.o'),
+              '{srcdir}/szrt_asm_{target}.s'.format(
+                srcdir=srcdir, target=target_info.target)
       ], echo=verbose)
     # Write full szrt_native_{target}.o.
     PartialLink([TmpFile('{dir}/szrt_native_{target}.tmp.o'),
-                 TmpFile('{dir}/szrt_profiler_native_{target}.o')],
+                 TmpFile('{dir}/szrt_native_asm_{target}.o'),
+                 TmpFile('{dir}/szrt_native_profiler_{target}.o')],
                 ['-m {ld_emu}'.format(ld_emu=target_info.ld_emu)],
                 OutFile('{rtdir}/szrt_native_{target}.o'),
                 verbose)
+    shellcmd(['le32-nacl-objcopy',
+              '--strip-symbol=NATIVE',
+              OutFile('{rtdir}/szrt_native_{target}.o')])
+
   # Helper function for building the sandboxed runtime.
   def MakeSandboxedRuntime():
     """Builds just the sandboxed runtime."""
@@ -82,8 +95,26 @@
     Translate(ll_files,
               ['-mtriple=' + targets.ConvertTripleToNaCl(target_info.triple)] +
               target_info.llc_flags,
-              OutFile('{rtdir}/szrt_sb_{target}.o'),
+              TmpFile('{dir}/szrt_sb_{target}.tmp.o'),
               verbose)
+    # Assemble srcdir/szrt_asm_{target}.s to tempdir/szrt_sb_asm_{target}.o.
+    shellcmd(['llvm-mc',
+              '-triple=' + targets.ConvertTripleToNaCl(target_info.triple),
+              '--defsym NACL=1',
+              '-filetype=obj',
+              '-o', TmpFile('{dir}/szrt_sb_asm_{target}.o'),
+              '{srcdir}/szrt_asm_{target}.s'.format(
+                srcdir=srcdir, target=target_info.target)
+      ], echo=verbose)
+    PartialLink([TmpFile('{dir}/szrt_sb_{target}.tmp.o'),
+                 TmpFile('{dir}/szrt_sb_asm_{target}.o')],
+                ['-m {ld_emu}'.format(ld_emu=target_info.sb_emu)],
+                OutFile('{rtdir}/szrt_sb_{target}.o'),
+                verbose)
+    shellcmd(['le32-nacl-objcopy',
+              '--strip-symbol=NACL',
+              OutFile('{rtdir}/szrt_sb_{target}.o')])
+
   # Helper function for building the Non-SFI runtime.
   def MakeNonsfiRuntime():
     """Builds just the nonsfi runtime."""
@@ -96,18 +127,22 @@
               verbose)
-    # Assemble srcdir/szrt_asm_{target}.s to tempdir/szrt_asm_{target}.o.
+    # Assemble srcdir/szrt_asm_{target}.s to tempdir/szrt_nonsfi_asm_{target}.o.
     shellcmd(['llvm-mc',
-              '-triple=' + target_info.triple,
+              '-triple=' + target_info.triple, '--defsym NONSFI=1',
               '-filetype=obj',
-              '-o', TmpFile('{dir}/szrt_asm_{target}.o'),
+              '-o', TmpFile('{dir}/szrt_nonsfi_asm_{target}.o'),
               '{srcdir}/szrt_asm_{target}.s'.format(
                 srcdir=srcdir, target=target_info.target)
       ], echo=verbose)
     # Write full szrt_nonsfi_{target}.o.
     PartialLink([TmpFile('{dir}/szrt_nonsfi_{target}.tmp.o'),
-                 TmpFile('{dir}/szrt_asm_{target}.o')],
+                 TmpFile('{dir}/szrt_nonsfi_asm_{target}.o')],
                 ['-m {ld_emu}'.format(ld_emu=target_info.ld_emu)],
                 OutFile('{rtdir}/szrt_nonsfi_{target}.o'),
                 verbose)
+    shellcmd(['le32-nacl-objcopy',
+              '--strip-symbol=NONSFI',
+              OutFile('{rtdir}/szrt_nonsfi_{target}.o')])
+
 
   # Run the helper functions.
   MakeNativeRuntime()
diff --git a/pydir/targets.py b/pydir/targets.py
index 5effd4e..ea51c28 100644
--- a/pydir/targets.py
+++ b/pydir/targets.py
@@ -18,13 +18,14 @@
 
 TargetInfo = namedtuple('TargetInfo',
                         ['target', 'compiler_arch', 'triple', 'llc_flags',
-                         'ld_emu', 'cross_headers'])
+                         'ld_emu', 'sb_emu', 'cross_headers'])
 
 X8632Target = TargetInfo(target='x8632',
                          compiler_arch='x8632',
                          triple='i686-none-linux',
                          llc_flags=['-mcpu=pentium4m'],
                          ld_emu='elf_i386_nacl',
+                         sb_emu='elf_i386_nacl',
                          cross_headers=[])
 
 X8664Target = TargetInfo(target='x8664',
@@ -32,6 +33,7 @@
                          triple='x86_64-none-linux-gnux32',
                          llc_flags=['-mcpu=x86-64'],
                          ld_emu='elf32_x86_64_nacl',
+                         sb_emu='elf_x86_64_nacl',
                          cross_headers=[])
 
 ARM32Target = TargetInfo(target='arm32',
@@ -41,6 +43,7 @@
                                     '-float-abi=hard',
                                     '-mattr=+neon'],
                          ld_emu='armelf_nacl',
+                         sb_emu='armelf_nacl',
                          cross_headers=['-isystem', FindARMCrossInclude()])
 
 def ConvertTripleToNaCl(nonsfi_triple):
diff --git a/runtime/szrt_asm_arm32.s b/runtime/szrt_asm_arm32.s
index 54cb380..1d01909 100644
--- a/runtime/szrt_asm_arm32.s
+++ b/runtime/szrt_asm_arm32.s
@@ -14,6 +14,3 @@
 
 	.text
 	.p2alignl 4,0xE7FEDEF0
-	.globl	__nacl_read_tp
-__nacl_read_tp:
-	b	__aeabi_read_tp
diff --git a/runtime/szrt_asm_x8632.s b/runtime/szrt_asm_x8632.s
index 518acef..d2387cd 100644
--- a/runtime/szrt_asm_x8632.s
+++ b/runtime/szrt_asm_x8632.s
@@ -15,6 +15,7 @@
 	.text
 	.p2align 5,0xf4
 
+	.ifdef NONSFI
 	.globl __Sz_getIP_eax
 __Sz_getIP_eax:
 	movl (%esp), %eax
@@ -49,3 +50,4 @@
 __Sz_getIP_edi:
 	movl (%esp), %edi
 	ret
+	.endif  # NONSFI
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 993dc44..11a4a1b 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -642,7 +642,7 @@
       *RegNumBVIter(Filtered ? Iter.Free : Iter.FreeUnfiltered).begin();
   Iter.Cur->setRegNumTmp(RegNum);
   if (Filtered)
-    dumpLiveRangeTrace("Allocating   ", Iter.Cur);
+    dumpLiveRangeTrace("Allocating Y ", Iter.Cur);
   else
     dumpLiveRangeTrace("Allocating X ", Iter.Cur);
   const llvm::SmallBitVector &Aliases = *RegAliases[RegNum];
@@ -768,7 +768,7 @@
     ++RegUses[RegAlias];
   }
   Active.push_back(Iter.Cur);
-  dumpLiveRangeTrace("Allocating   ", Iter.Cur);
+  dumpLiveRangeTrace("Allocating Z ", Iter.Cur);
 }
 
 void LinearScan::assignFinalRegisters(
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 85e0a65..b3e43eb 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -385,7 +385,8 @@
       // This is not the variable we are looking for.
       continue;
     }
-    assert(Var64->hasReg() || !Var64->mustHaveReg());
+    // Only allow infinite-weight i64 temporaries to be register allocated.
+    assert(!Var64->hasReg() || Var64->mustHaveReg());
     if (!Var64->hasReg()) {
       continue;
     }
@@ -4401,10 +4402,16 @@
 }
 
 TargetARM32::CondWhenTrue TargetARM32::lowerIcmpCond(const InstIcmp *Instr) {
-  Operand *Src0 = legalizeUndef(Instr->getSrc(0));
-  Operand *Src1 = legalizeUndef(Instr->getSrc(1));
+  return lowerIcmpCond(Instr->getCondition(), Instr->getSrc(0),
+                       Instr->getSrc(1));
+}
 
-  const InstIcmp::ICond Condition = Instr->getCondition();
+TargetARM32::CondWhenTrue TargetARM32::lowerIcmpCond(InstIcmp::ICond Condition,
+                                                     Operand *Src0,
+                                                     Operand *Src1) {
+  Src0 = legalizeUndef(Src0);
+  Src1 = legalizeUndef(Src1);
+
   // a=icmp cond b, c ==>
   // GCC does:
   //   <u/s>xtb tb, b
@@ -4504,162 +4511,156 @@
 }
 } // end of anonymous namespace
 
+void TargetARM32::lowerLoadLinkedStoreExclusive(
+    Type Ty, Operand *Addr, std::function<Variable *(Variable *)> Operation,
+    CondARM32::Cond Cond) {
+
+  auto *Retry = Context.insert<InstARM32Label>(this);
+  { // scoping for loop highlighting.
+    Variable *Tmp = (Ty == IceType_i64) ? makeI64RegPair() : makeReg(Ty);
+    auto *Success = makeReg(IceType_i32);
+    auto *_0 = Ctx->getConstantZero(IceType_i32);
+
+    Context.insert<InstFakeDef>(Tmp);
+    Context.insert<InstFakeUse>(Tmp);
+    Variable *AddrR = legalizeToReg(Addr);
+    _ldrex(Tmp, formMemoryOperand(AddrR, Ty))->setDestRedefined();
+    auto *StoreValue = Operation(Tmp);
+    assert(StoreValue->mustHaveReg());
+    _strex(Success, StoreValue, formMemoryOperand(AddrR, Ty), Cond);
+    _cmp(Success, _0, Cond);
+  }
+  _br(Retry, CondARM32::NE);
+}
+
+namespace {
+InstArithmetic *createArithInst(Cfg *Func, uint32_t Operation, Variable *Dest,
+                                Variable *Src0, Operand *Src1) {
+  InstArithmetic::OpKind Oper;
+  switch (Operation) {
+  default:
+    llvm::report_fatal_error("Unknown AtomicRMW operation");
+  case Intrinsics::AtomicExchange:
+    llvm::report_fatal_error("Can't handle Atomic xchg operation");
+  case Intrinsics::AtomicAdd:
+    Oper = InstArithmetic::Add;
+    break;
+  case Intrinsics::AtomicAnd:
+    Oper = InstArithmetic::And;
+    break;
+  case Intrinsics::AtomicSub:
+    Oper = InstArithmetic::Sub;
+    break;
+  case Intrinsics::AtomicOr:
+    Oper = InstArithmetic::Or;
+    break;
+  case Intrinsics::AtomicXor:
+    Oper = InstArithmetic::Xor;
+    break;
+  }
+  return InstArithmetic::create(Func, Oper, Dest, Src0, Src1);
+}
+} // end of anonymous namespace
+
 void TargetARM32::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
-                                 Operand *Ptr, Operand *Val) {
+                                 Operand *Addr, Operand *Val) {
   // retry:
-  //     ldrex contents, [addr]
-  //     op tmp, contents, operand
-  //     strex success, tmp, [addr]
+  //     ldrex tmp, [addr]
+  //     mov contents, tmp
+  //     op result, contents, Val
+  //     strex success, result, [addr]
+  //     cmp success, 0
-  //     jne retry
+  //     bne retry
   //     fake-use(addr, operand)  @ prevents undesirable clobbering.
   //     mov dest, contents
-  assert(Dest != nullptr);
-  Type DestTy = Dest->getType();
-  (void)Ptr;
-  (void)Val;
-
-  OperandARM32Mem *Mem;
-  Variable *PtrContentsReg;
-  Variable *PtrContentsHiReg;
-  Variable *PtrContentsLoReg;
-  Variable *Value = Func->makeVariable(DestTy);
-  Variable *ValueReg;
-  Variable *ValueHiReg;
-  Variable *ValueLoReg;
-  Variable *Success = makeReg(IceType_i32);
-  Variable *TmpReg;
-  Variable *TmpHiReg;
-  Variable *TmpLoReg;
-  Operand *_0 = Ctx->getConstantZero(IceType_i32);
-  auto *Retry = InstARM32Label::create(Func, this);
+  auto DestTy = Dest->getType();
 
   if (DestTy == IceType_i64) {
-    Variable64On32 *PtrContentsReg64 = makeI64RegPair();
-    PtrContentsHiReg = PtrContentsReg64->getHi();
-    PtrContentsLoReg = PtrContentsReg64->getLo();
-    PtrContentsReg = PtrContentsReg64;
-
-    llvm::cast<Variable64On32>(Value)->initHiLo(Func);
-    Variable64On32 *ValueReg64 = makeI64RegPair();
-    ValueHiReg = ValueReg64->getHi();
-    ValueLoReg = ValueReg64->getLo();
-    ValueReg = ValueReg64;
-
-    Variable64On32 *TmpReg64 = makeI64RegPair();
-    TmpHiReg = TmpReg64->getHi();
-    TmpLoReg = TmpReg64->getLo();
-    TmpReg = TmpReg64;
-  } else {
-    PtrContentsReg = makeReg(DestTy);
-    PtrContentsHiReg = nullptr;
-    PtrContentsLoReg = PtrContentsReg;
-
-    ValueReg = makeReg(DestTy);
-    ValueHiReg = nullptr;
-    ValueLoReg = ValueReg;
-
-    TmpReg = makeReg(DestTy);
-    TmpHiReg = nullptr;
-    TmpLoReg = TmpReg;
-  }
-
-  if (DestTy == IceType_i64) {
-    Context.insert<InstFakeDef>(Value);
-  }
-  lowerAssign(InstAssign::create(Func, Value, Val));
-
-  Variable *PtrVar = Func->makeVariable(IceType_i32);
-  lowerAssign(InstAssign::create(Func, PtrVar, Ptr));
-
-  _dmb();
-  Context.insert(Retry);
-  Mem = formMemoryOperand(PtrVar, DestTy);
-  if (DestTy == IceType_i64) {
-    Context.insert<InstFakeDef>(ValueReg, Value);
-  }
-  lowerAssign(InstAssign::create(Func, ValueReg, Value));
-  if (DestTy == IceType_i8 || DestTy == IceType_i16) {
-    _uxt(ValueReg, ValueReg);
-  }
-  _ldrex(PtrContentsReg, Mem);
-
-  if (DestTy == IceType_i64) {
-    Context.insert<InstFakeDef>(TmpReg, ValueReg);
-  }
-  switch (Operation) {
-  default:
-    Func->setError("Unknown AtomicRMW operation");
+    lowerInt64AtomicRMW(Dest, Operation, Addr, Val);
     return;
-  case Intrinsics::AtomicAdd:
-    if (DestTy == IceType_i64) {
-      _adds(TmpLoReg, PtrContentsLoReg, ValueLoReg);
-      _adc(TmpHiReg, PtrContentsHiReg, ValueHiReg);
-    } else {
-      _add(TmpLoReg, PtrContentsLoReg, ValueLoReg);
-    }
-    break;
-  case Intrinsics::AtomicSub:
-    if (DestTy == IceType_i64) {
-      _subs(TmpLoReg, PtrContentsLoReg, ValueLoReg);
-      _sbc(TmpHiReg, PtrContentsHiReg, ValueHiReg);
-    } else {
-      _sub(TmpLoReg, PtrContentsLoReg, ValueLoReg);
-    }
-    break;
-  case Intrinsics::AtomicOr:
-    _orr(TmpLoReg, PtrContentsLoReg, ValueLoReg);
-    if (DestTy == IceType_i64) {
-      _orr(TmpHiReg, PtrContentsHiReg, ValueHiReg);
-    }
-    break;
-  case Intrinsics::AtomicAnd:
-    _and(TmpLoReg, PtrContentsLoReg, ValueLoReg);
-    if (DestTy == IceType_i64) {
-      _and(TmpHiReg, PtrContentsHiReg, ValueHiReg);
-    }
-    break;
-  case Intrinsics::AtomicXor:
-    _eor(TmpLoReg, PtrContentsLoReg, ValueLoReg);
-    if (DestTy == IceType_i64) {
-      _eor(TmpHiReg, PtrContentsHiReg, ValueHiReg);
-    }
-    break;
-  case Intrinsics::AtomicExchange:
-    _mov(TmpLoReg, ValueLoReg);
-    if (DestTy == IceType_i64) {
-      _mov(TmpHiReg, ValueHiReg);
-    }
-    break;
   }
-  _strex(Success, TmpReg, Mem);
-  _cmp(Success, _0);
-  _br(Retry, CondARM32::NE);
 
-  // The following fake-uses ensure that Subzero will not clobber them in the
-  // load-linked/store-conditional loop above. We might have to spill them, but
-  // spilling is preferable over incorrect behavior.
-  Context.insert<InstFakeUse>(PtrVar);
-  if (auto *Value64 = llvm::dyn_cast<Variable64On32>(Value)) {
-    Context.insert<InstFakeUse>(Value64->getHi());
-    Context.insert<InstFakeUse>(Value64->getLo());
+  Operand *ValRF = nullptr;
+  if (llvm::isa<ConstantInteger32>(Val)) {
+    ValRF = Val;
   } else {
-    Context.insert<InstFakeUse>(Value);
+    ValRF = legalizeToReg(Val);
   }
+  auto *ContentsR = makeReg(DestTy);
+  auto *ResultR = makeReg(DestTy);
+
   _dmb();
-  if (DestTy == IceType_i8 || DestTy == IceType_i16) {
-    _uxt(PtrContentsReg, PtrContentsReg);
+  lowerLoadLinkedStoreExclusive(
+      DestTy, Addr,
+      [this, Operation, ResultR, ContentsR, ValRF](Variable *Tmp) {
+        lowerAssign(InstAssign::create(Func, ContentsR, Tmp));
+        if (Operation == Intrinsics::AtomicExchange) {
+          lowerAssign(InstAssign::create(Func, ResultR, ValRF));
+        } else {
+          lowerArithmetic(
+              createArithInst(Func, Operation, ResultR, ContentsR, ValRF));
+        }
+        return ResultR;
+      });
+  _dmb();
+  if (auto *ValR = llvm::dyn_cast<Variable>(ValRF)) {
+    Context.insert<InstFakeUse>(ValR);
+  }
+  // Add a fake-use so ContentsR cannot be dead-code eliminated.
+  Context.insert<InstFakeUse>(ContentsR);
+  lowerAssign(InstAssign::create(Func, Dest, ContentsR));
+}
+
+void TargetARM32::lowerInt64AtomicRMW(Variable *Dest, uint32_t Operation,
+                                      Operand *Addr, Operand *Val) {
+  assert(Dest->getType() == IceType_i64);
+
+  auto *ResultR = makeI64RegPair();
+
+  Context.insert<InstFakeDef>(ResultR);
+
+  Operand *ValRF = nullptr;
+  if (llvm::isa<ConstantInteger64>(Val)) {
+    ValRF = Val;
+  } else {
+    auto *ValR64 = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
+    ValR64->initHiLo(Func);
+    ValR64->setMustNotHaveReg();
+    ValR64->getLo()->setMustHaveReg();
+    ValR64->getHi()->setMustHaveReg();
+    lowerAssign(InstAssign::create(Func, ValR64, Val));
+    ValRF = ValR64;
   }
 
-  if (DestTy == IceType_i64) {
-    Context.insert<InstFakeUse>(PtrContentsReg);
+  auto *ContentsR = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
+  ContentsR->initHiLo(Func);
+  ContentsR->setMustNotHaveReg();
+  ContentsR->getLo()->setMustHaveReg();
+  ContentsR->getHi()->setMustHaveReg();
+
+  _dmb();
+  lowerLoadLinkedStoreExclusive(
+      IceType_i64, Addr,
+      [this, Operation, ResultR, ContentsR, ValRF](Variable *Tmp) {
+        lowerAssign(InstAssign::create(Func, ContentsR, Tmp));
+        Context.insert<InstFakeUse>(Tmp);
+        if (Operation == Intrinsics::AtomicExchange) {
+          lowerAssign(InstAssign::create(Func, ResultR, ValRF));
+        } else {
+          lowerArithmetic(
+              createArithInst(Func, Operation, ResultR, ContentsR, ValRF));
+        }
+        Context.insert<InstFakeUse>(ResultR->getHi());
+        Context.insert<InstFakeDef>(ResultR, ResultR->getLo())
+            ->setDestRedefined();
+        return ResultR;
+      });
+  _dmb();
+  if (auto *ValR64 = llvm::dyn_cast<Variable64On32>(ValRF)) {
+    Context.insert<InstFakeUse>(ValR64->getLo());
+    Context.insert<InstFakeUse>(ValR64->getHi());
   }
-  lowerAssign(InstAssign::create(Func, Dest, PtrContentsReg));
-  if (auto *Dest64 = llvm::dyn_cast<Variable64On32>(Dest)) {
-    Context.insert<InstFakeUse>(Dest64->getLo());
-    Context.insert<InstFakeUse>(Dest64->getHi());
-  } else {
-    Context.insert<InstFakeUse>(Dest);
-  }
+  lowerAssign(InstAssign::create(Func, Dest, ContentsR));
 }
 
 void TargetARM32::postambleCtpop64(const InstCall *Instr) {
@@ -4733,10 +4734,9 @@
     }
     _dmb();
     lowerAssign(InstAssign::create(Func, Dest, T));
-    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
-    // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
-    // the FakeUse on the last-inserted instruction's dest.
-    Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
+    // Add a fake-use of T to ensure the atomic load is not removed if Dest
+    // is unused.
+    Context.insert<InstFakeUse>(T);
     return;
   }
   case Intrinsics::AtomicStore: {
@@ -4747,105 +4747,48 @@
       Func->setError("Unexpected memory ordering for AtomicStore");
       return;
     }
-    Operand *Value = Instr->getArg(0);
-    Type ValueTy = Value->getType();
-    assert(isScalarIntegerType(ValueTy));
-    Operand *Addr = Instr->getArg(1);
 
-    if (ValueTy == IceType_i64) {
-      // Atomic 64-bit stores require a load-locked/store-conditional loop using
-      // ldrexd, and strexd. The lowered code is:
-      //
-      // retry:
-      //     ldrexd t.lo, t.hi, [addr]
-      //     strexd success, value.lo, value.hi, [addr]
-      //     cmp success, #0
-      //     bne retry
-      //     fake-use(addr, value.lo, value.hi)
-      //
-      // The fake-use is needed to prevent those variables from being clobbered
-      // in the loop (which will happen under register pressure.)
-      Variable64On32 *Tmp = makeI64RegPair();
-      Variable64On32 *ValueVar =
-          llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
-      Variable *AddrVar = makeReg(IceType_i32);
-      Variable *Success = makeReg(IceType_i32);
-      OperandARM32Mem *Mem;
-      Operand *_0 = Ctx->getConstantZero(IceType_i32);
-      auto *Retry = InstARM32Label::create(Func, this);
-      Variable64On32 *NewReg = makeI64RegPair();
-      ValueVar->initHiLo(Func);
-      ValueVar->mustNotHaveReg();
-
+    auto *Value = Instr->getArg(0);
+    if (Value->getType() == IceType_i64) {
+      auto *ValueR = makeI64RegPair();
+      Context.insert<InstFakeDef>(ValueR);
+      lowerAssign(InstAssign::create(Func, ValueR, Value));
       _dmb();
-      lowerAssign(InstAssign::create(Func, ValueVar, Value));
-      lowerAssign(InstAssign::create(Func, AddrVar, Addr));
-
-      Context.insert(Retry);
-      Context.insert<InstFakeDef>(NewReg);
-      lowerAssign(InstAssign::create(Func, NewReg, ValueVar));
-      Mem = formMemoryOperand(AddrVar, IceType_i64);
-      _ldrex(Tmp, Mem);
-      // This fake-use both prevents the ldrex from being dead-code eliminated,
-      // while also keeping liveness happy about all defs being used.
-      Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
-      _strex(Success, NewReg, Mem);
-      _cmp(Success, _0);
-      _br(Retry, CondARM32::NE);
-
-      Context.insert<InstFakeUse>(ValueVar->getLo());
-      Context.insert<InstFakeUse>(ValueVar->getHi());
-      Context.insert<InstFakeUse>(AddrVar);
+      lowerLoadLinkedStoreExclusive(
+          IceType_i64, Instr->getArg(1), [this, ValueR](Variable *Tmp) {
+            // The following fake-uses prevent the ldrex instruction from
+            // being dead-code eliminated.
+            Context.insert<InstFakeUse>(llvm::cast<Variable>(loOperand(Tmp)));
+            Context.insert<InstFakeUse>(llvm::cast<Variable>(hiOperand(Tmp)));
+            Context.insert<InstFakeUse>(Tmp);
+            return ValueR;
+          });
+      Context.insert<InstFakeUse>(ValueR);
       _dmb();
       return;
     }
+
+    auto *ValueR = legalizeToReg(Instr->getArg(0));
+    const auto ValueTy = ValueR->getType();
+    assert(isScalarIntegerType(ValueTy));
+    auto *Addr = legalizeToReg(Instr->getArg(1));
+
     // Non-64-bit stores are atomic as long as the address is aligned. This is
     // PNaCl, so addresses are aligned.
-    Variable *T = makeReg(ValueTy);
-
     _dmb();
-    lowerAssign(InstAssign::create(Func, T, Value));
-    _str(T, formMemoryOperand(Addr, ValueTy));
+    _str(ValueR, formMemoryOperand(Addr, ValueTy));
     _dmb();
     return;
   }
   case Intrinsics::AtomicCmpxchg: {
-    // The initial lowering for cmpxchg was:
-    //
     // retry:
     //     ldrex tmp, [addr]
     //     cmp tmp, expected
     //     mov expected, tmp
-    //     jne retry
-    //     strex success, new, [addr]
-    //     cmp success, #0
-    //     bne retry
-    //     mov dest, expected
-    //
-    // Besides requiring two branches, that lowering could also potentially
-    // write to memory (in mov expected, tmp) unless we were OK with increasing
-    // the register pressure and requiring expected to be an infinite-weight
-    // variable (spoiler alert: that was a problem for i64 cmpxchg.) Through
-    // careful rewritting, and thanks to predication, we now implement the
-    // lowering as:
-    //
-    // retry:
-    //     ldrex tmp, [addr]
-    //     cmp tmp, expected
     //     strexeq success, new, [addr]
-    //     movne expected, tmp
     //     cmpeq success, #0
     //     bne retry
     //     mov dest, expected
-    //
-    // Predication lets us move the strex ahead of the mov expected, tmp, which
-    // allows tmp to be a non-infinite weight temporary. We wanted to avoid
-    // writing to memory between ldrex and strex because, even though most times
-    // that would cause no issues, if any interleaving memory write aliased
-    // [addr] than we would have undefined behavior. Undefined behavior isn't
-    // cool, so we try to avoid it. See the "Synchronization and semaphores"
-    // section of the "ARM Architecture Reference Manual."
-
     assert(isScalarIntegerType(DestTy));
     // We require the memory address to be naturally aligned. Given that is the
     // case, then normal loads are atomic.
@@ -4856,98 +4799,63 @@
       return;
     }
 
-    OperandARM32Mem *Mem;
-    Variable *TmpReg;
-    Variable *Expected, *ExpectedReg;
-    Variable *New, *NewReg;
-    Variable *Success = makeReg(IceType_i32);
-    Operand *_0 = Ctx->getConstantZero(IceType_i32);
-    auto *Retry = InstARM32Label::create(Func, this);
-
     if (DestTy == IceType_i64) {
-      Variable64On32 *TmpReg64 = makeI64RegPair();
-      Variable64On32 *New64 =
-          llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
-      Variable64On32 *NewReg64 = makeI64RegPair();
-      Variable64On32 *Expected64 =
-          llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
-      Variable64On32 *ExpectedReg64 = makeI64RegPair();
-
-      New64->initHiLo(Func);
-      New64->mustNotHaveReg();
-      Expected64->initHiLo(Func);
-      Expected64->mustNotHaveReg();
-
-      TmpReg = TmpReg64;
-      New = New64;
-      NewReg = NewReg64;
-      Expected = Expected64;
-      ExpectedReg = ExpectedReg64;
-    } else {
-      TmpReg = makeReg(DestTy);
-      New = Func->makeVariable(DestTy);
-      NewReg = makeReg(DestTy);
-      Expected = Func->makeVariable(DestTy);
-      ExpectedReg = makeReg(DestTy);
-    }
-
-    Mem = formMemoryOperand(Instr->getArg(0), DestTy);
-    if (DestTy == IceType_i64) {
-      Context.insert<InstFakeDef>(Expected);
-    }
-    lowerAssign(InstAssign::create(Func, Expected, Instr->getArg(1)));
-    if (DestTy == IceType_i64) {
+      auto *New = makeI64RegPair();
       Context.insert<InstFakeDef>(New);
+      lowerAssign(InstAssign::create(Func, New, Instr->getArg(2)));
+
+      auto *Expected = makeI64RegPair();
+      Context.insert<InstFakeDef>(Expected);
+      lowerAssign(InstAssign::create(Func, Expected, Instr->getArg(1)));
+
+      _dmb();
+      lowerLoadLinkedStoreExclusive(
+          DestTy, Instr->getArg(0),
+          [this, Expected, New, Instr, DestTy](Variable *Tmp) {
+            auto *ExpectedLoR = llvm::cast<Variable>(loOperand(Expected));
+            auto *ExpectedHiR = llvm::cast<Variable>(hiOperand(Expected));
+            auto *TmpLoR = llvm::cast<Variable>(loOperand(Tmp));
+            auto *TmpHiR = llvm::cast<Variable>(hiOperand(Tmp));
+            _cmp(TmpLoR, ExpectedLoR);
+            _cmp(TmpHiR, ExpectedHiR, CondARM32::EQ);
+            // Add an explicit use of Tmp here, or its live range will not
+            // reach this point (only those of Tmp.Lo and Tmp.Hi will).
+            Context.insert<InstFakeUse>(Tmp);
+            _mov_redefined(ExpectedLoR, TmpLoR);
+            _mov_redefined(ExpectedHiR, TmpHiR);
+            // Same as above.
+            Context.insert<InstFakeUse>(Tmp);
+            return New;
+          },
+          CondARM32::EQ);
+      _dmb();
+
+      lowerAssign(InstAssign::create(Func, Dest, Expected));
+      // The fake-use of Expected prevents the assignments to Expected (above)
+      // from being removed if Dest is unused.
+      Context.insert<InstFakeUse>(Expected);
+      // New needs to be alive here, or its live range will end at the
+      // strex instruction.
+      Context.insert<InstFakeUse>(New);
+      return;
     }
-    lowerAssign(InstAssign::create(Func, New, Instr->getArg(2)));
+
+    auto *New = legalizeToReg(Instr->getArg(2));
+    auto *Expected = legalizeToReg(Instr->getArg(1));
+
+    _dmb();
+    lowerLoadLinkedStoreExclusive(
+        DestTy,
+        Instr->getArg(0), [this, Expected, New, Instr, DestTy](Variable *Tmp) {
+          lowerIcmpCond(InstIcmp::Eq, Tmp, Expected);
+          _mov_redefined(Expected, Tmp);
+          return New;
+        }, CondARM32::EQ);
     _dmb();
 
-    Context.insert(Retry);
-    if (DestTy == IceType_i64) {
-      Context.insert<InstFakeDef>(ExpectedReg, Expected);
-    }
-    lowerAssign(InstAssign::create(Func, ExpectedReg, Expected));
-    if (DestTy == IceType_i64) {
-      Context.insert<InstFakeDef>(NewReg, New);
-    }
-    lowerAssign(InstAssign::create(Func, NewReg, New));
-
-    _ldrex(TmpReg, Mem);
-    Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
-    if (DestTy == IceType_i64) {
-      auto *TmpReg64 = llvm::cast<Variable64On32>(TmpReg);
-      auto *ExpectedReg64 = llvm::cast<Variable64On32>(ExpectedReg);
-      // lowerAssign above has added fake-defs for TmpReg and ExpectedReg. Let's
-      // keep liveness happy, shall we?
-      Context.insert<InstFakeUse>(TmpReg);
-      Context.insert<InstFakeUse>(ExpectedReg);
-      _cmp(TmpReg64->getHi(), ExpectedReg64->getHi());
-      _cmp(TmpReg64->getLo(), ExpectedReg64->getLo(), CondARM32::EQ);
-    } else {
-      _cmp(TmpReg, ExpectedReg);
-    }
-    _strex(Success, NewReg, Mem, CondARM32::EQ);
-    if (DestTy == IceType_i64) {
-      auto *TmpReg64 = llvm::cast<Variable64On32>(TmpReg);
-      auto *Expected64 = llvm::cast<Variable64On32>(Expected);
-      _mov_redefined(Expected64->getHi(), TmpReg64->getHi(), CondARM32::NE);
-      _mov_redefined(Expected64->getLo(), TmpReg64->getLo(), CondARM32::NE);
-      Context.insert<InstFakeDef>(Expected, TmpReg);
-      _set_dest_redefined();
-    } else {
-      _mov_redefined(Expected, TmpReg, CondARM32::NE);
-    }
-    _cmp(Success, _0, CondARM32::EQ);
-    _br(Retry, CondARM32::NE);
-    _dmb();
     lowerAssign(InstAssign::create(Func, Dest, Expected));
     Context.insert<InstFakeUse>(Expected);
-    if (auto *New64 = llvm::dyn_cast<Variable64On32>(New)) {
-      Context.insert<InstFakeUse>(New64->getLo());
-      Context.insert<InstFakeUse>(New64->getHi());
-    } else {
-      Context.insert<InstFakeUse>(New);
-    }
+    Context.insert<InstFakeUse>(New);
     return;
   }
   case Intrinsics::AtomicRMW: {
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index e457127..83e3c58 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -246,8 +246,29 @@
                                   Operand *Src1);
   CondWhenTrue lowerInt64IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
                                   Operand *Src1);
+  CondWhenTrue lowerIcmpCond(InstIcmp::ICond Condition, Operand *Src0,
+                             Operand *Src1);
   CondWhenTrue lowerIcmpCond(const InstIcmp *Instr);
   void lowerIcmp(const InstIcmp *Instr) override;
+  /// Emits the basic sequence for load-linked/store-exclusive loops:
+  ///
+  /// retry:
+  ///        ldrex tmp, [Addr]
+  ///        StoreValue = Operation(tmp)
+  ///        strexCond success, StoreValue, [Addr]
+  ///        cmpCond success, #0
+  ///        bne retry
+  ///
+  /// Operation needs to return the value that should be stored to Addr with
+  /// strex; it must not change the flags if Cond is not AL, and it must not
+  /// emit any instruction that could end up writing to memory. Operation is
+  /// also responsible for inserting the fake-defs needed for i64 lowering.
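+  ///
+  /// For example, the 64-bit atomic store lowering passes a callback that
+  /// only fake-uses the loaded value and returns the register pair to store
+  /// with strexd (a sketch; the real call site is in lowerIntrinsicCall):
+  ///
+  ///   lowerLoadLinkedStoreExclusive(
+  ///       IceType_i64, Addr, [this, ValueR](Variable *Tmp) {
+  ///         Context.insert<InstFakeUse>(Tmp);
+  ///         return ValueR;
+  ///       });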
+  void
+  lowerLoadLinkedStoreExclusive(Type Ty, Operand *Addr,
+                                std::function<Variable *(Variable *)> Operation,
+                                CondARM32::Cond Cond = CondARM32::AL);
+  void lowerInt64AtomicRMW(Variable *Dest, uint32_t Operation, Operand *Addr,
+                           Operand *Val);
   void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                       Operand *Val);
   void lowerIntrinsicCall(const InstIntrinsicCall *Instr) override;
@@ -360,13 +381,14 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert<InstARM32Ldr>(Dest, Addr, Pred);
   }
-  void _ldrex(Variable *Dest, OperandARM32Mem *Addr,
-              CondARM32::Cond Pred = CondARM32::AL) {
-    Context.insert<InstARM32Ldrex>(Dest, Addr, Pred);
+  InstARM32Ldrex *_ldrex(Variable *Dest, OperandARM32Mem *Addr,
+                         CondARM32::Cond Pred = CondARM32::AL) {
+    auto *Ldrex = Context.insert<InstARM32Ldrex>(Dest, Addr, Pred);
     if (auto *Dest64 = llvm::dyn_cast<Variable64On32>(Dest)) {
       Context.insert<InstFakeDef>(Dest64->getLo(), Dest);
       Context.insert<InstFakeDef>(Dest64->getHi(), Dest);
     }
+    return Ldrex;
   }
   void _lsl(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
diff --git a/tests_lit/assembler/arm32/ldrex-strex.ll b/tests_lit/assembler/arm32/ldrex-strex.ll
index b999801..47e99f3 100644
--- a/tests_lit/assembler/arm32/ldrex-strex.ll
+++ b/tests_lit/assembler/arm32/ldrex-strex.ll
@@ -28,140 +28,38 @@
 
 define internal i32 @testI8Form(i32 %ptr, i32 %a) {
 ; ASM-LABEL:testI8Form:
-; DIS-LABEL:00000000 <testI8Form>:
+; DIS-LABEL:<testI8Form>:
 ; IASM-LABEL:testI8Form:
 
 entry:
-; ASM-NEXT:.LtestI8Form$entry:
-; IASM-NEXT:.LtestI8Form$entry:
-
-; ASM-NEXT:     sub     sp, sp, #28
-; DIS-NEXT:   0:        e24dd01c
-; IASM-NEXT:    .byte 0x1c
-; IASM-NEXT:    .byte 0xd0
-; IASM-NEXT:    .byte 0x4d
-; IASM-NEXT:    .byte 0xe2
-
-; ASM-NEXT:     str     r0, [sp, #24]
-; ASM-NEXT:     # [sp, #24] = def.pseudo
-; DIS-NEXT:   4:        e58d0018
-; IASM-NEXT:    .byte 0x18
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r1, [sp, #20]
-; ASM-NEXT:     # [sp, #20] = def.pseudo
-; DIS-NEXT:   8:        e58d1014
-; IASM-NEXT:    .byte 0x14
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
   %ptr.asptr = inttoptr i32 %ptr to i8*
   %a.arg_trunc = trunc i32 %a to i8
 
-; ASM-NEXT:     ldr     r0, [sp, #20]
-; DIS-NEXT:   c:        e59d0014
-; IASM-NEXT:    .byte 0x14
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     strb    r0, [sp, #16]
-; DIS-NEXT:  10:        e5cd0010
-; ASM-NEXT:     # [sp, #16] = def.pseudo
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xcd
-; IASM-NEXT:    .byte 0xe5
-
   %v = call i8 @llvm.nacl.atomic.rmw.i8(i32 1, i8* %ptr.asptr,
                                         i8 %a.arg_trunc, i32 6)
 
-; ASM-NEXT:     ldrb    r0, [sp, #16]
-; DIS-NEXT:  14:        e5dd0010
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xdd
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     strb    r0, [sp, #4]
-; ASM-NEXT:     # [sp, #4] = def.pseudo
-; DIS-NEXT:  18:        e5cd0004
-; IASM-NEXT:    .byte 0x4
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xcd
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldr     r0, [sp, #24]
-; DIS-NEXT:  1c:        e59d0018
-; IASM-NEXT:    .byte 0x18
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r0, [sp]
-; ASM-NEXT:     # [sp] = def.pseudo
-; DIS-NEXT:  20:        e58d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     dmb     sy
-; DIS-NEXT:  24:        f57ff05f
-; IASM-NEXT:    .byte 0x5f
+; ***** Example of dmb *****
+; ASM:          dmb     sy
+; DIS:     1c:  f57ff05f
+; IASM:         .byte 0x5f
 ; IASM-NEXT:    .byte 0xf0
 ; IASM-NEXT:    .byte 0x7f
 ; IASM-NEXT:    .byte 0xf5
 
-; ASM-NEXT:.LtestI8Form$local$__0:
-; IASM-NEXT:.LtestI8Form$local$__0:
-
-; ASM-NEXT:     ldr     r0, [sp]
-; DIS-NEXT:  28:        e59d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldrb    r1, [sp, #4]
-; DIS-NEXT:  2c:        e5dd1004
-; IASM-NEXT:    .byte 0x4
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0xdd
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     uxtb    r1, r1
-; DIS-NEXT:  30:        e6ef1071
-; IASM-NEXT:    .byte 0x71
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0xef
-; IASM-NEXT:    .byte 0xe6
-
 ; ***** Example of ldrexb *****
-; ASM-NEXT:     ldrexb  r2, [r0]
-; DIS-NEXT:  34:        e1d02f9f
-; IASM-NEXT:    .byte 0x9f
-; IASM-NEXT:    .byte 0x2f
-; IASM-NEXT:    .byte 0xd0
+; ASM:          ldrexb  r1, [r2]
+; DIS:     24:  e1d21f9f
+; IASM:         .byte 0x9f
+; IASM-NEXT:    .byte 0x1f
+; IASM-NEXT:    .byte 0xd2
 ; IASM-NEXT:    .byte 0xe1
 
-; ASM-NEXT:     add     r1, r2, r1
-; ASM-NEXT:     # r3 = def.pseudo
-; DIS-NEXT:  38:        e0821001
-; IASM-NEXT:    .byte 0x1
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x82
-; IASM-NEXT:    .byte 0xe0
-
 ; ***** Example of strexb *****
-; ASM-NEXT:     strexb  r3, r1, [r0]
-; DIS-NEXT:  3c:        e1c03f91
-; IASM-NEXT:    .byte 0x91
-; IASM-NEXT:    .byte 0x3f
-; IASM-NEXT:    .byte 0xc0
+; ASM:          strexb  r4, r3, [r2]
+; DIS:     2c:  e1c24f93
+; IASM:         .byte 0x93
+; IASM-NEXT:    .byte 0x4f
+; IASM-NEXT:    .byte 0xc2
 ; IASM-NEXT:    .byte 0xe1
 
   %retval = zext i8 %v to i32
@@ -170,140 +68,29 @@
 
 define internal i32 @testI16Form(i32 %ptr, i32 %a) {
 ; ASM-LABEL:testI16Form:
-; DIS-LABEL:00000070 <testI16Form>:
+; DIS-LABEL:<testI16Form>:
 ; IASM-LABEL:testI16Form:
 
 entry:
-; ASM-NEXT:.LtestI16Form$entry:
-; IASM-NEXT:.LtestI16Form$entry:
-
-; ASM-NEXT:     sub     sp, sp, #28
-; DIS-NEXT:  70:        e24dd01c
-; IASM-NEXT:    .byte 0x1c
-; IASM-NEXT:    .byte 0xd0
-; IASM-NEXT:    .byte 0x4d
-; IASM-NEXT:    .byte 0xe2
-
-; ASM-NEXT:     str     r0, [sp, #24]
-; ASM-NEXT:     # [sp, #24] = def.pseudo
-; DIS-NEXT:  74:        e58d0018
-; IASM-NEXT:    .byte 0x18
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r1, [sp, #20]
-; ASM-NEXT:     # [sp, #20] = def.pseudo
-; DIS-NEXT:  78:        e58d1014
-; IASM-NEXT:    .byte 0x14
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
   %ptr.asptr = inttoptr i32 %ptr to i16*
   %a.arg_trunc = trunc i32 %a to i16
 
-; ASM-NEXT:     ldr     r0, [sp, #20]
-; DIS-NEXT:  7c:        e59d0014
-; IASM-NEXT:    .byte 0x14
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     strh    r0, [sp, #16]
-; ASM-NEXT:     # [sp, #16] = def.pseudo
-; DIS-NEXT:  80:        e1cd01b0
-; IASM-NEXT:    .byte 0xb0
-; IASM-NEXT:    .byte 0x1
-; IASM-NEXT:    .byte 0xcd
-; IASM-NEXT:    .byte 0xe1
-
   %v = call i16 @llvm.nacl.atomic.rmw.i16(i32 1, i16* %ptr.asptr,
                                           i16 %a.arg_trunc, i32 6)
-
-; ASM-NEXT:     ldrh    r0, [sp, #16]
-; DIS-NEXT:  84:        e1dd01b0
-; IASM-NEXT:    .byte 0xb0
-; IASM-NEXT:    .byte 0x1
-; IASM-NEXT:    .byte 0xdd
-; IASM-NEXT:    .byte 0xe1
-
-; ASM-NEXT:     strh    r0, [sp, #4]
-; ASM-NEXT:     # [sp, #4] = def.pseudo
-; DIS-NEXT:  88:        e1cd00b4
-; IASM-NEXT:    .byte 0xb4
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xcd
-; IASM-NEXT:    .byte 0xe1
-
-; ASM-NEXT:     ldr     r0, [sp, #24]
-; DIS-NEXT:  8c:        e59d0018
-; IASM-NEXT:    .byte 0x18
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r0, [sp]
-; ASM-NEXT:     # [sp] = def.pseudo
-; DIS-NEXT:  90:        e58d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     dmb     sy
-; DIS-NEXT:  94:        f57ff05f
-; IASM-NEXT:    .byte 0x5f
-; IASM-NEXT:    .byte 0xf0
-; IASM-NEXT:    .byte 0x7f
-; IASM-NEXT:    .byte 0xf5
-
-; ASM-NEXT:.LtestI16Form$local$__0:
-; IASM-NEXT:.LtestI16Form$local$__0:
-
-; ASM-NEXT:     ldr     r0, [sp]
-; DIS-NEXT:  98:        e59d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldrh    r1, [sp, #4]
-; DIS-NEXT:  9c:        e1dd10b4
-; IASM-NEXT:    .byte 0xb4
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0xdd
-; IASM-NEXT:    .byte 0xe1
-
-; ASM-NEXT:     uxth    r1, r1
-; DIS-NEXT:  a0:        e6ff1071
-; IASM-NEXT:    .byte 0x71
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0xff
-; IASM-NEXT:    .byte 0xe6
-
 ; ***** Example of ldrexh *****
-; ASM-NEXT:     ldrexh  r2, [r0]
-; DIS-NEXT:  a4:        e1f02f9f
-; IASM-NEXT:    .byte 0x9f
-; IASM-NEXT:    .byte 0x2f
-; IASM-NEXT:    .byte 0xf0
+; ASM:          ldrexh  r1, [r2]
+; DIS:     84:  e1f21f9f
+; IASM:         .byte 0x9f
+; IASM-NEXT:    .byte 0x1f
+; IASM-NEXT:    .byte 0xf2
 ; IASM-NEXT:    .byte 0xe1
 
-; ASM-NEXT:     add     r1, r2, r1
-; ASM-NEXT:     # r3 = def.pseudo
-; DIS-NEXT:  a8:        e0821001
-; IASM-NEXT:        .byte 0x1
-; IASM-NEXT:        .byte 0x10
-; IASM-NEXT:        .byte 0x82
-; IASM-NEXT:        .byte 0xe0
-
 ; ***** Example of strexh *****
-; ASM-NEXT:     strexh  r3, r1, [r0]
-; DIS-NEXT:  ac:        e1e03f91
-; IASM-NEXT:    .byte 0x91
-; IASM-NEXT:    .byte 0x3f
-; IASM-NEXT:    .byte 0xe0
+; ASM:          strexh  r4, r3, [r2]
+; DIS:     8c:  e1e24f93
+; IASM:         .byte 0x93
+; IASM-NEXT:    .byte 0x4f
+; IASM-NEXT:    .byte 0xe2
 ; IASM-NEXT:    .byte 0xe1
 
   %retval = zext i16 %v to i32
@@ -312,116 +99,28 @@
 
 define internal i32 @testI32Form(i32 %ptr, i32 %a) {
 ; ASM-LABEL:testI32Form:
-; DIS-LABEL:000000e0 <testI32Form>:
+; DIS-LABEL:<testI32Form>:
 ; IASM-LABEL:testI32Form:
 
 entry:
-; ASM-NEXT:.LtestI32Form$entry:
-; IASM-NEXT:.LtestI32Form$entry:
-
-; ASM-NEXT:     sub     sp, sp, #20
-; DIS-NEXT:  e0:        e24dd014
-; IASM-NEXT:    .byte 0x14
-; IASM-NEXT:    .byte 0xd0
-; IASM-NEXT:    .byte 0x4d
-; IASM-NEXT:    .byte 0xe2
-
-; ASM-NEXT:     str     r0, [sp, #16]
-; ASM-NEXT:     # [sp, #16] = def.pseudo
-; DIS-NEXT:  e4:        e58d0010
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r1, [sp, #12]
-; ASM-NEXT:     # [sp, #12] = def.pseudo
-; DIS-NEXT:  e8:        e58d100c
-; IASM-NEXT:    .byte 0xc
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
   %ptr.asptr = inttoptr i32 %ptr to i32*
   %v = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr.asptr,
                                           i32 %a, i32 6)
 
-; ASM-NEXT:     ldr     r0, [sp, #12]
-; DIS-NEXT:  ec:        e59d000c
-; IASM-NEXT:    .byte 0xc
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r0, [sp, #4]
-; ASM-NEXT:     # [sp, #4] = def.pseudo
-; DIS-NEXT:  f0:        e58d0004
-; IASM-NEXT:    .byte 0x4
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldr     r0, [sp, #16]
-; DIS-NEXT:  f4:        e59d0010
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r0, [sp]
-; ASM-NEXT:     # [sp] = def.pseudo
-; DIS-NEXT:  f8:        e58d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     dmb     sy
-; DIS-NEXT:  fc:        f57ff05f
-; IASM-NEXT:    .byte 0x5f
-; IASM-NEXT:    .byte 0xf0
-; IASM-NEXT:    .byte 0x7f
-; IASM-NEXT:    .byte 0xf5
-
-; ASM-NEXT:.LtestI32Form$local$__0:
-; IASM-NEXT:.LtestI32Form$local$__0:
-
-; ASM-NEXT:     ldr     r0, [sp]
-; DIS-NEXT: 100:        e59d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldr     r1, [sp, #4]
-; DIS-NEXT: 104:        e59d1004
-; IASM-NEXT:    .byte 0x4
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
 ; ***** Example of ldrex *****
-; ASM-NEXT:     ldrex   r2, [r0]
-; DIS-NEXT: 108:        e1902f9f
-; IASM-NEXT:    .byte 0x9f
-; IASM-NEXT:    .byte 0x2f
-; IASM-NEXT:    .byte 0x90
+; ASM:          ldrex   r1, [r2]
+; DIS:     dc:  e1921f9f
+; IASM:         .byte 0x9f
+; IASM-NEXT:    .byte 0x1f
+; IASM-NEXT:    .byte 0x92
 ; IASM-NEXT:    .byte 0xe1
 
-; ASM-NEXT:     add     r1, r2, r1
-; ASM-NEXT:     # r3 = def.pseudo
-; DIS-NEXT: 10c:        e0821001
-; IASM-NEXT:    .byte 0x1
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x82
-; IASM-NEXT:    .byte 0xe0
-
 ; ***** Example of strex *****
-; ASM-NEXT:     strex   r3, r1, [r0]
-; DIS-NEXT: 110:        e1803f91
-; IASM-NEXT:    .byte 0x91
-; IASM-NEXT:    .byte 0x3f
-; IASM-NEXT:    .byte 0x80
+; ASM:          strex   r4, r3, [r2]
+; DIS:     e4:  e1824f93
+; IASM:         .byte 0x93
+; IASM-NEXT:    .byte 0x4f
+; IASM-NEXT:    .byte 0x82
 ; IASM-NEXT:    .byte 0xe1
 
   ret i32 %v
@@ -429,193 +128,28 @@
 
 define internal i64 @testI64Form(i32 %ptr, i64 %a) {
 ; ASM-LABEL:testI64Form:
-; DIS-LABEL:00000130 <testI64Form>:
+; DIS-LABEL:<testI64Form>:
 ; IASM-LABEL:testI64Form:
 
 entry:
-; ASM-NEXT:.LtestI64Form$entry:
-; IASM-NEXT:.LtestI64Form$entry:
-
-; ASM-NEXT:     push    {r4, r5}
-; DIS-NEXT: 130:        e92d0030
-; IASM-NEXT:    .byte 0x30
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x2d
-; IASM-NEXT:    .byte 0xe9
-
-; ASM-NEXT:     sub     sp, sp, #32
-; DIS-NEXT: 134:        e24dd020
-; IASM-NEXT:    .byte 0x20
-; IASM-NEXT:    .byte 0xd0
-; IASM-NEXT:    .byte 0x4d
-; IASM-NEXT:    .byte 0xe2
-
-; ASM-NEXT:     str     r0, [sp, #28]
-; ASM-NEXT:     # [sp, #28] = def.pseudo
-; DIS-NEXT: 138:        e58d001c
-; IASM-NEXT:    .byte 0x1c
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     mov     r0, r2
-; DIS-NEXT: 13c:        e1a00002
-; IASM-NEXT:    .byte 0x2
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xa0
-; IASM-NEXT:    .byte 0xe1
-
-; ASM-NEXT:     str     r0, [sp, #24]
-; ASM-NEXT:     # [sp, #24] = def.pseudo
-; DIS-NEXT: 140:        e58d0018
-; IASM-NEXT:    .byte 0x18
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     mov     r0, r3
-; DIS-NEXT: 144:        e1a00003
-; IASM-NEXT:    .byte 0x3
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xa0
-; IASM-NEXT:    .byte 0xe1
-
-; ASM-NEXT:     str     r0, [sp, #20]
-; ASM-NEXT:     # [sp, #20] = def.pseudo
-; ASM-NEXT:     # [sp] = def.pseudo
-; DIS-NEXT: 148:        e58d0014
-; IASM-NEXT:    .byte 0x14
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
   %ptr.asptr = inttoptr i32 %ptr to i64*
   %v = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr.asptr,
                                           i64 %a, i32 6)
 
-; ASM-NEXT:     ldr     r0, [sp, #24]
-; DIS-NEXT: 14c:        e59d0018
-; IASM-NEXT:    .byte 0x18
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r0, [sp, #8]
-; ASM-NEXT:     # [sp, #8] = def.pseudo
-; DIS-NEXT: 150:        e58d0008
-; IASM-NEXT:    .byte 0x8
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldr     r0, [sp, #20]
-; DIS-NEXT: 154:        e59d0014
-; IASM-NEXT:    .byte 0x14
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r0, [sp, #4]
-; ASM-NEXT:     # [sp, #4] = def.pseudo
-; DIS-NEXT: 158:        e58d0004
-; IASM-NEXT:    .byte 0x4
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldr     r0, [sp, #28]
-; DIS-NEXT: 15c:        e59d001c
-; IASM-NEXT:    .byte 0x1c
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     str     r0, [sp]
-; ASM-NEXT:     # [sp] = def.pseudo
-; DIS-NEXT: 160:        e58d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x8d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     dmb     sy
-; DIS-NEXT: 164:        f57ff05f
-; IASM-NEXT:    .byte 0x5f
-; IASM-NEXT:    .byte 0xf0
-; IASM-NEXT:    .byte 0x7f
-; IASM-NEXT:    .byte 0xf5
-
-; ASM-NEXT:.LtestI64Form$local$__0:
-; IASM-NEXT:.LtestI64Form$local$__0:
-
-; ASM-NEXT:     ldr     r0, [sp]
-; ASM-NEXT:     # r2, r3 = def.pseudo [sp]
-; DIS-NEXT: 168:        e59d0000
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     ldr     r1, [sp, #8]
-; DIS-NEXT: 16c:        e59d1008
-; IASM-NEXT:    .byte 0x8
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     mov     r2, r1
-; DIS-NEXT: 170:        e1a02001
-; IASM-NEXT:    .byte 0x1
-; IASM-NEXT:    .byte 0x20
-; IASM-NEXT:    .byte 0xa0
-; IASM-NEXT:    .byte 0xe1
-
-; ASM-NEXT:     ldr     r1, [sp, #4]
-; DIS-NEXT: 174:        e59d1004
-; IASM-NEXT:    .byte 0x4
-; IASM-NEXT:    .byte 0x10
-; IASM-NEXT:    .byte 0x9d
-; IASM-NEXT:    .byte 0xe5
-
-; ASM-NEXT:     mov     r3, r1
-; DIS-NEXT: 178:        e1a03001
-; IASM-NEXT:    .byte 0x1
-; IASM-NEXT:    .byte 0x30
-; IASM-NEXT:    .byte 0xa0
-; IASM-NEXT:    .byte 0xe1
-
 ; ***** Example of ldrexd *****
-; ASM-NEXT:     ldrexd  r4, r5, [r0]
-; ASM-NEXT:     # r4 = def.pseudo r4, r5
-; ASM-NEXT:     # r5 = def.pseudo r4, r5
-; ASM-NEXT:     # r2, r3 = def.pseudo r2, r3
-; DIS-NEXT: 17c:        e1b04f9f
-; IASM-NEXT:    .byte 0x9f
+; ASM:          ldrexd  r4, r5, [r6]
+; DIS:     13c: e1b64f9f
+; IASM:         .byte 0x9f
 ; IASM-NEXT:    .byte 0x4f
-; IASM-NEXT:    .byte 0xb0
+; IASM-NEXT:    .byte 0xb6
 ; IASM-NEXT:    .byte 0xe1
 
-; ASM-NEXT:     adds    r2, r4, r2
-; DIS-NEXT: 180:        e0942002
-; IASM-NEXT:    .byte 0x2
-; IASM-NEXT:    .byte 0x20
-; IASM-NEXT:    .byte 0x94
-; IASM-NEXT:    .byte 0xe0
-
-; ASM-NEXT:     adc     r3, r5, r3
-; ASM-NEXT:     # r1 = def.pseudo
-; DIS-NEXT: 184:        e0a53003
-; IASM-NEXT:    .byte 0x3
-; IASM-NEXT:    .byte 0x30
-; IASM-NEXT:    .byte 0xa5
-; IASM-NEXT:    .byte 0xe0
-
 ; ***** Example of strexd *****
-; ASM-NEXT:     strexd  r1, r2, r3, [r0]
-; DIS-NEXT: 188:        e1a01f92
-; IASM-NEXT:    .byte 0x92
-; IASM-NEXT:    .byte 0x1f
-; IASM-NEXT:    .byte 0xa0
+; ASM:          strexd  r4, r0, r1, [r6]
+; DIS:     158: e1a64f90
+; IASM:         .byte 0x90
+; IASM-NEXT:    .byte 0x4f
+; IASM-NEXT:    .byte 0xa6
 ; IASM-NEXT:    .byte 0xe1
 
   ret i64 %v
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
index 12d4267..42a6ed8 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -246,10 +246,10 @@
 ; CHECK: movq QWORD {{.*}},x{{.*}}
 ; CHECK: mfence
 ; ARM32-LABEL: test_atomic_store_64_const
-; ARM32: dmb
 ; ARM32: movw [[T0:r[0-9]+]], #12274
 ; ARM32: movt [[T0]], #29646
 ; ARM32: movw r{{[0-9]+}}, #2874
+; ARM32: dmb
 ; ARM32: .L[[RETRY:.*]]:
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [[MEM:.*]]
 ; ARM32: strexd [[S:r[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}}, [[MEM]]
@@ -342,7 +342,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: adds
-; ARM32-NEXT: adc
+; ARM32: adc
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -359,7 +359,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: adds
-; ARM32-NEXT: adc
+; ARM32: adc
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -400,7 +400,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: adds
-; ARM32-NEXT: adc
+; ARM32: adc
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -457,7 +457,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: adds
-; ARM32-NEXT: adc
+; ARM32: adc
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -543,7 +543,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: subs
-; ARM32-NEXT: sbc
+; ARM32: sbc
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -602,9 +602,9 @@
 }
 ; CHECK-LABEL: test_atomic_rmw_or_8_global
 ; ARM32-LABEL: test_atomic_rmw_or_8_global
+; ARM32: dmb
 ; ARM32: movw [[PTR:r[0-9]+]], #:lower16:SzGlobal8
 ; ARM32: movt [[PTR]], #:upper16:SzGlobal8
-; ARM32: dmb
 ; ARM32: ldrexb r{{[0-9]+}}, {{[[]}}[[PTR]]{{[]]}}
 ; ARM32: orr
 ; ARM32: strexb
@@ -643,9 +643,9 @@
 }
 ; CHECK-LABEL: test_atomic_rmw_or_16_global
 ; ARM32-LABEL: test_atomic_rmw_or_16_global
+; ARM32: dmb
 ; ARM32: movw [[PTR:r[0-9]+]], #:lower16:SzGlobal16
 ; ARM32: movt [[PTR]], #:upper16:SzGlobal16
-; ARM32: dmb
 ; ARM32: ldrexh r{{[0-9]+}}, {{[[]}}[[PTR]]{{[]]}}
 ; ARM32: orr
 ; ARM32: strexh
@@ -680,9 +680,9 @@
 }
 ; CHECK-LABEL: test_atomic_rmw_or_32_global
 ; ARM32-LABEL: test_atomic_rmw_or_32_global
+; ARM32: dmb
 ; ARM32: movw [[PTR:r[0-9]+]], #:lower16:SzGlobal32
 ; ARM32: movt [[PTR]], #:upper16:SzGlobal32
-; ARM32: dmb
 ; ARM32: ldrex r{{[0-9]+}}, {{[[]}}[[PTR]]{{[]]}}
 ; ARM32: orr
 ; ARM32: strex
@@ -709,7 +709,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: orr
-; ARM32-NEXT: orr
+; ARM32: orr
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -819,7 +819,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: and
-; ARM32-NEXT: and
+; ARM32: and
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -927,7 +927,7 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: eor
-; ARM32-NEXT: eor
+; ARM32: eor
 ; ARM32: strexd r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, [r{{[0-9]+}}]
 ; ARM32: bne
 ; ARM32: dmb
@@ -1067,8 +1067,8 @@
 ; ARM32: dmb
 ; ARM32: ldrexb
 ; ARM32: cmp
+; ARM32: {{strb|mov}}
 ; ARM32: strexbeq
-; ARM32: {{strb|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb
@@ -1091,8 +1091,8 @@
 ; ARM32: dmb
 ; ARM32: ldrexh
 ; ARM32: cmp
+; ARM32: {{strh|mov}}
 ; ARM32: strexheq
-; ARM32: {{strh|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb
@@ -1112,8 +1112,8 @@
 ; ARM32: dmb
 ; ARM32: ldrex
 ; ARM32: cmp
+; ARM32: {{str|mov}}
 ; ARM32: strexeq
-; ARM32: {{str|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb
@@ -1140,10 +1140,10 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR:r[0-9]+]]{{[]]}}
 ; ARM32: cmp
-; ARM32-NEXT: cmpeq
+; ARM32: cmpeq
+; ARM32: mov
+; ARM32: mov
 ; ARM32: strexdeq r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR]]{{[]]}}
-; ARM32: {{str|mov}}ne
-; ARM32: {{str|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb
@@ -1163,10 +1163,10 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR:r[0-9]+]]{{[]]}}
 ; ARM32: cmp
-; ARM32-NEXT: cmpeq
+; ARM32: cmpeq
+; ARM32: mov
+; ARM32: mov
 ; ARM32: strexdeq r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR]]{{[]]}}
-; ARM32: {{str|mov}}ne
-; ARM32: {{str|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb
@@ -1195,10 +1195,10 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR:r[0-9]+]]{{[]]}}
 ; ARM32: cmp
-; ARM32-NEXT: cmpeq
+; ARM32: cmpeq
+; ARM32: mov
+; ARM32: mov
 ; ARM32: strexdeq r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR]]{{[]]}}
-; ARM32: {{str|mov}}ne
-; ARM32: {{str|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb
@@ -1241,10 +1241,10 @@
 ; ARM32: dmb
 ; ARM32: ldrexd r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR:r[0-9]+]]{{[]]}}
 ; ARM32: cmp
-; ARM32-NEXT: cmpeq
+; ARM32: cmpeq
+; ARM32: mov
+; ARM32: mov
 ; ARM32: strexdeq r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}, {{[[]}}[[PTR]]{{[]]}}
-; ARM32: {{str|mov}}ne
-; ARM32: {{str|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb
@@ -1265,7 +1265,6 @@
 ; ARM32: ldrex
 ; ARM32: cmp
 ; ARM32: strexeq
-; ARM32: {{str|mov}}ne
 ; ARM32: cmpeq
 ; ARM32: bne
 ; ARM32: dmb