First pass at emitIAS for branches and binding labels

Fixups of forward branches are not tested yet; for now
emitIAS() streams a ".byte (foo - (. + 1))" or a
".long (foo - (. + 4))" directive instead. Fixups can be
supported once emitIAS() delays writing things out until
after the function is fully emitted (and therefore all
forward labels have been bound).

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/673543002
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 414a8e1..606f785 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -94,7 +94,7 @@
   template <typename T> T *getAssembler() const {
     return static_cast<T *>(TargetAssembler.get());
   }
-  bool UseIntegratedAssembler() const {
+  bool useIntegratedAssembler() const {
     return getContext()->getFlags().UseIntegratedAssembler;
   }
   bool hasComputedFrame() const;
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 4e4d36b..d715767 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "assembler.h"
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceInst.h"
@@ -491,6 +492,10 @@
     Str << Func->getContext()->mangleName(Func->getFunctionName()) << ":\n";
   }
   Str << getAsmName() << ":\n";
+  if (Func->useIntegratedAssembler()) {
+    Assembler *Asm = Func->getAssembler<Assembler>();
+    Asm->BindCfgNodeLabel(getIndex());
+  }
   for (InstPhi *Phi : Phis) {
     if (Phi->isDeleted())
       continue;
@@ -505,7 +510,7 @@
     // suppress them.
     if (I->isRedundantAssign())
       continue;
-    if (Func->UseIntegratedAssembler()) {
+    if (Func->useIntegratedAssembler()) {
       I->emitIAS(Func);
     } else {
       I->emit(Func);
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 9419866..ae4a30d 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -373,6 +373,33 @@
   }
 }
 
+void emitIASBytesBranch(const Cfg *Func, const x86::AssemblerX86 *Asm,
+                        intptr_t StartPosition, const x86::Label *Label,
+                        const IceString &LabelName, bool Near) {
+  // If this is a backward branch (label is bound), the offset is already
+  // known and the bytes can be emitted as-is. If this is a forward branch,
+  // the fixup hasn't happened yet, so the bytes can't be streamed as text.
+  // In that case, emit ".byte Label - (. + 1)" (near) or
+  // ".long Label - (. + 4)" (far) and let the external assembler do the fixup.
+  if (Label->IsBound()) {
+    emitIASBytes(Func, Asm, StartPosition);
+    return;
+  }
+  const intptr_t FwdBranchSize = Near ? 1 : 4;
+  const IceString FwdBranchDirective = Near ? ".byte" : ".long";
+  Ostream &Str = Func->getContext()->getStrEmit();
+  intptr_t EndPosition = Asm->GetPosition();
+  assert(EndPosition - StartPosition > FwdBranchSize);
+  for (intptr_t i = StartPosition; i < EndPosition - FwdBranchSize; ++i) {
+    Str << "\t.byte 0x";
+    Str.write_hex(Asm->LoadBuffer<uint8_t>(i));
+    Str << "\n";
+  }
+  Str << "\t" << FwdBranchDirective << " " << LabelName << " - (. + "
+      << FwdBranchSize << ")\n";
+  return;
+}
+
 } // end of anonymous namespace
 
 void InstX8632::dump(const Cfg *Func) const {
@@ -386,6 +413,15 @@
   Str << getName(Func) << ":\n";
 }
 
+void InstX8632Label::emitIAS(const Cfg *Func) const {
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  Asm->BindLocalLabel(Number);
+  // TODO(jvoung): remove the textual label once forward branch
+  // fixups are used (and text assembler is not used).
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << getName(Func) << ":\n";
+}
+
 void InstX8632Label::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << getName(Func) << ":";
@@ -415,6 +451,47 @@
   }
 }
 
+void InstX8632Br::emitIAS(const Cfg *Func) const {
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  intptr_t StartPosition = Asm->GetPosition();
+  if (Label) {
+    x86::Label *L = Asm->GetOrCreateLocalLabel(Label->getNumber());
+    // In all these cases, local Labels should only be used for Near.
+    const bool Near = true;
+    if (Condition == CondX86::Br_None) {
+      Asm->jmp(L, Near);
+    } else {
+      Asm->j(Condition, L, Near);
+    }
+    emitIASBytesBranch(Func, Asm, StartPosition, L, Label->getName(Func), Near);
+  } else {
+    // Pessimistically assume it's far. This only affects Labels that
+    // are not Bound.
+    const bool Near = false;
+    if (Condition == CondX86::Br_None) {
+      x86::Label *L =
+          Asm->GetOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+      assert(!getTargetTrue());
+      Asm->jmp(L, Near);
+      emitIASBytesBranch(Func, Asm, StartPosition, L,
+                         getTargetFalse()->getAsmName(), Near);
+    } else {
+      x86::Label *L = Asm->GetOrCreateCfgNodeLabel(getTargetTrue()->getIndex());
+      Asm->j(Condition, L, Near);
+      emitIASBytesBranch(Func, Asm, StartPosition, L,
+                         getTargetTrue()->getAsmName(), Near);
+      StartPosition = Asm->GetPosition();
+      if (getTargetFalse()) {
+        x86::Label *L2 =
+            Asm->GetOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+        Asm->jmp(L2, Near);
+        emitIASBytesBranch(Func, Asm, StartPosition, L2,
+                           getTargetFalse()->getAsmName(), Near);
+      }
+    }
+  }
+}
+
 void InstX8632Br::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "br ";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 8235d1f..81d3fd8 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -274,13 +274,14 @@
   }
 };
 
-// InstX8632Label represents an intra-block label that is the
-// target of an intra-block branch.  These are used for lowering i1
-// calculations, Select instructions, and 64-bit compares on a 32-bit
-// architecture, without basic block splitting.  Basic block splitting
-// is not so desirable for several reasons, one of which is the impact
-// on decisions based on whether a variable's live range spans
-// multiple basic blocks.
+// InstX8632Label represents an intra-block label that is the target
+// of an intra-block branch.  The offset between the label and the
+// branch must fit into one byte (considered "near").  These are
+// used for lowering i1 calculations, Select instructions, and 64-bit
+// compares on a 32-bit architecture, without basic block splitting.
+// Basic block splitting is not so desirable for several reasons, one
+// of which is the impact on decisions based on whether a variable's
+// live range spans multiple basic blocks.
 //
 // Intra-block control flow must be used with caution.  Consider the
 // sequence for "c = (a >= b ? x : y)".
@@ -321,15 +322,15 @@
   }
   uint32_t getEmitInstCount() const override { return 0; }
   IceString getName(const Cfg *Func) const;
+  SizeT getNumber() const { return Number; }
   void emit(const Cfg *Func) const override;
-  // TODO(jvoung): Filler in.
-  void emitIAS(const Cfg *Func) const override { emit(Func); }
+  void emitIAS(const Cfg *Func) const override;
   void dump(const Cfg *Func) const override;
 
 private:
   InstX8632Label(Cfg *Func, TargetX8632 *Target);
   ~InstX8632Label() override {}
-  SizeT Number; // used only for unique label string generation
+  SizeT Number; // used for unique label generation.
 };
 
 // Conditional and unconditional branch instruction.
@@ -385,8 +386,7 @@
     return Sum;
   }
   void emit(const Cfg *Func) const override;
-  // TODO(jvoung): Filler in.
-  void emitIAS(const Cfg *Func) const override { emit(Func); }
+  void emitIAS(const Cfg *Func) const override;
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Br); }
 
diff --git a/src/assembler.h b/src/assembler.h
index d769b3a..d3cd9bf 100644
--- a/src/assembler.h
+++ b/src/assembler.h
@@ -207,7 +207,7 @@
 
 public:
   Assembler() {}
-  ~Assembler() {}
+  virtual ~Assembler() {}
 
   // Allocate a chunk of bytes using the per-Assembler allocator.
   uintptr_t AllocateBytes(size_t bytes) {
@@ -224,6 +224,8 @@
   // Allocate data of type T using the per-Assembler allocator.
   template <typename T> T *Allocate() { return Allocator.Allocate<T>(); }
 
+  virtual void BindCfgNodeLabel(SizeT NodeNumber) = 0;
+
 private:
   llvm::BumpPtrAllocator Allocator;
 };
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
index 2b80820..11ff859 100644
--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -67,6 +67,53 @@
   return x86::Address::Absolute(Fixup);
 }
 
+AssemblerX86::~AssemblerX86() {
+#ifndef NDEBUG
+  for (const Label *Label : CfgNodeLabels) {
+    Label->FinalCheck();
+  }
+  for (const Label *Label : LocalLabels) {
+    Label->FinalCheck();
+  }
+#endif
+}
+
+Label *AssemblerX86::GetOrCreateLabel(SizeT Number, LabelVector &Labels) {
+  Label *L = nullptr;
+  if (Number == Labels.size()) {
+    L = new (this->Allocate<Label>()) Label();
+    Labels.push_back(L);
+    return L;
+  }
+  if (Number > Labels.size()) {
+    Labels.resize(Number + 1);
+  }
+  L = Labels[Number];
+  if (!L) {
+    L = new (this->Allocate<Label>()) Label();
+    Labels[Number] = L;
+  }
+  return L;
+}
+
+Label *AssemblerX86::GetOrCreateCfgNodeLabel(SizeT NodeNumber) {
+  return GetOrCreateLabel(NodeNumber, CfgNodeLabels);
+}
+
+Label *AssemblerX86::GetOrCreateLocalLabel(SizeT Number) {
+  return GetOrCreateLabel(Number, LocalLabels);
+}
+
+void AssemblerX86::BindCfgNodeLabel(SizeT NodeNumber) {
+  Label *L = GetOrCreateCfgNodeLabel(NodeNumber);
+  this->Bind(L);
+}
+
+void AssemblerX86::BindLocalLabel(SizeT Number) {
+  Label *L = GetOrCreateLocalLabel(Number);
+  this->Bind(L);
+}
+
 void AssemblerX86::call(GPRRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xFF);
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index bb3f0ab..b1079d1 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -284,7 +284,9 @@
 #endif // !NDEBUG
   }
 
-  ~Label() {
+  ~Label() {}
+
+  void FinalCheck() const {
     // Assert if label is being destroyed with unresolved branches pending.
     assert(!IsLinked());
     assert(!HasNear());
@@ -363,11 +365,16 @@
     assert(!use_far_branches);
     (void)use_far_branches;
   }
-  ~AssemblerX86() {}
+  ~AssemblerX86() override;
 
   static const bool kNearJump = true;
   static const bool kFarJump = false;
 
+  Label *GetOrCreateCfgNodeLabel(SizeT NodeNumber);
+  void BindCfgNodeLabel(SizeT NodeNumber) override;
+  Label *GetOrCreateLocalLabel(SizeT Number);
+  void BindLocalLabel(SizeT Number);
+
   // Operations to emit GPR instructions (and dispatch on operand type).
   typedef void (AssemblerX86::*TypedEmitGPR)(Type, GPRRegister);
   typedef void (AssemblerX86::*TypedEmitAddr)(Type, const Address &);
@@ -848,6 +855,14 @@
   void EmitGenericShift(int rm, Type Ty, const Operand &operand,
                         GPRRegister shifter);
 
+  typedef std::vector<Label *> LabelVector;
+  // A vector of pool-allocated x86 labels for CFG nodes.
+  LabelVector CfgNodeLabels;
+  // A vector of pool-allocated x86 labels for Local labels.
+  LabelVector LocalLabels;
+
+  Label *GetOrCreateLabel(SizeT Number, LabelVector &Labels);
+
   AssemblerBuffer buffer_;
 };
 
diff --git a/tests_lit/assembler/x86/jump_encodings.ll b/tests_lit/assembler/x86/jump_encodings.ll
new file mode 100644
index 0000000..130d59f
--- /dev/null
+++ b/tests_lit/assembler/x86/jump_encodings.ll
@@ -0,0 +1,218 @@
+; Tests various aspects of x86 branch encodings (near vs far,
+; forward vs backward, using CFG labels, or local labels).
+
+; Use -ffunction-sections so that the offsets reset for each function.
+; RUN: %p2i -i %s --args -O2 --verbose none -ffunction-sections \
+; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
+
+; Use atomic ops as filler, which shouldn't get optimized out.
+declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
+declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
+declare i32 @llvm.nacl.atomic.rmw.i32(i32, i32*, i32, i32)
+
+define void @test_near_backward(i32 %iptr, i32 %val) {
+entry:
+  br label %next
+next:
+  %ptr = inttoptr i32 %iptr to i32*
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  br label %next2
+next2:
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  %cmp = icmp ult i32 %val, 0
+  br i1 %cmp, label %next2, label %next
+}
+
+; CHECK-LABEL: test_near_backward
+; CHECK:      8: {{.*}}  mov dword ptr
+; CHECK-NEXT: a: {{.*}}  mfence
+; CHECK-NEXT: d: {{.*}}  mov dword ptr
+; CHECK-NEXT: f: {{.*}}  mfence
+; CHECK-NEXT: 12: {{.*}} cmp
+; (0x15 + 2) - 10 == 0xd
+; CHECK-NEXT: 15: 72 f6 jb -10
+; (0x17 + 2) - 17 == 0x8
+; CHECK-NEXT: 17: eb ef jmp -17
+
+; Test one of the backward branches being too large for 8 bits
+; and one being just okay.
+define void @test_far_backward1(i32 %iptr, i32 %val) {
+entry:
+  br label %next
+next:
+  %ptr = inttoptr i32 %iptr to i32*
+  %tmp = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+  br label %next2
+next2:
+  call void @llvm.nacl.atomic.store.i32(i32 %tmp, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  %cmp = icmp ugt i32 %val, 0
+  br i1 %cmp, label %next2, label %next
+}
+
+; CHECK-LABEL: test_far_backward1
+; CHECK:      8: {{.*}}  mov {{.*}}, dword ptr [e{{[^s]}}
+; CHECK-NEXT: a: {{.*}}  mov dword ptr
+; CHECK-NEXT: c: {{.*}}  mfence
+; (0x85 + 2) - 125 == 0xa
+; CHECK: 85: 77 83 ja -125
+; (0x87 + 5) - 132 == 0x8
+; CHECK-NEXT: 87: e9 7c ff ff ff jmp -132
+
+; Same as test_far_backward1, but with the conditional branch being
+; the one that is too far.
+define void @test_far_backward2(i32 %iptr, i32 %val) {
+entry:
+  br label %next
+next:
+  %ptr = inttoptr i32 %iptr to i32*
+  %tmp = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+  %tmp2 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+  %tmp3 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+  %tmp4 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+  %tmp5 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+  br label %next2
+next2:
+  call void @llvm.nacl.atomic.store.i32(i32 %tmp, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %tmp2, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %tmp3, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %tmp4, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %tmp5, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  %cmp = icmp sle i32 %val, 0
+  br i1 %cmp, label %next, label %next2
+}
+
+; CHECK-LABEL: test_far_backward2
+; CHECK:      c:  {{.*}}  mov {{.*}}, dword ptr [e{{[^s]}}
+; CHECK:      14: {{.*}}  mov {{.*}}, dword ptr
+; CHECK-NEXT: 16: {{.*}}  mov dword ptr
+; CHECK-NEXT: 18: {{.*}}  mfence
+; (0x8c + 6) - 134 == 0xc
+; CHECK: 8c: 0f 8e 7a ff ff ff jle -134
+; (0x92 + 2) - 126 == 0x16
+; CHECK-NEXT: 92: eb 82 jmp -126
+
+define void @test_near_forward(i32 %iptr, i32 %val) {
+entry:
+  br label %next1
+next1:
+  %ptr = inttoptr i32 %iptr to i32*
+  %cmp = icmp ult i32 %val, 0
+  br i1 %cmp, label %next3, label %next2
+next2:
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  br label %next3
+next3:
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  br label %next1
+}
+; Forward branches for non-local labels currently use the fully relaxed
+; form to avoid needing a relaxation pass.
+; CHECK-LABEL: test_near_forward
+; CHECK:       8: {{.*}}            cmp
+; CHECK-NEXT:  b: 0f 82 05 00 00 00 jb 5
+; CHECK-NEXT: 11: {{.*}}            mov dword ptr
+; CHECK-NEXT: 13: {{.*}}            mfence
+; Forward branch is 5 bytes ahead to here.
+; CHECK-NEXT: 16: {{.*}}            mov dword ptr
+; Jumps back to (0x1b + 2) - 21 == 0x8 (to before the forward branch,
+; which confirms that the forward branch was indeed 6 bytes).
+; CHECK:      1b: eb eb             jmp -21
+
+
+; Unlike forward branches to cfg nodes, "local" forward branches
+; always use a 1 byte displacement.
+; Check local forward branches, followed by a near backward branch
+; to make sure that the instruction size accounting for the forward
+; branches is correct, by the time the backward branch is hit.
+; A 64-bit compare happens to use local forward branches.
+define void @test_local_forward_then_back(i64 %val64, i32 %iptr, i32 %val) {
+entry:
+  br label %next
+next:
+  %ptr = inttoptr i32 %iptr to i32*
+  call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+  br label %next2
+next2:
+  %cmp = icmp ult i64 %val64, 0
+  br i1 %cmp, label %next, label %next2
+}
+; CHECK-LABEL: test_local_forward_then_back
+; CHECK:      14: {{.*}} mov dword ptr
+; CHECK-NEXT: 16: {{.*}} mfence
+; CHECK-NEXT: 19: {{.*}} mov dword ptr {{.*}}, 1
+; CHECK-NEXT: 20: {{.*}} cmp
+; CHECK-NEXT: 23: {{.*}} jb 14
+; (0x37 + 2) - 37 == 0x14
+; CHECK:      37: {{.*}} jne -37
+; (0x39 + 2) - 34 == 0x19
+; CHECK:      39: {{.*}} jmp -34
+
+
+; Test that backward local branches also work and are small.
+; Some of the atomic instructions use a cmpxchg loop.
+define void @test_local_backward(i64 %val64, i32 %iptr, i32 %val) {
+entry:
+  br label %next
+next:
+  %ptr = inttoptr i32 %iptr to i32*
+  %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %val, i32 6)
+  br label %next2
+next2:
+  %success = icmp eq i32 1, %a
+  br i1 %success, label %next, label %next2
+}
+; CHECK-LABEL: test_local_backward
+; CHECK:       9: {{.*}} mov {{.*}}, dword
+; CHECK:       b: {{.*}} mov
+; CHECK-NEXT:  d: {{.*}} xor
+; CHECK-NEXT:  f: {{.*}} lock
+; CHECK-NEXT: 10: {{.*}} cmpxchg
+; (0x13 + 2) - 10 == 0xb
+; CHECK-NEXT: 13: 75 f6 jne -10
+; (0x1c + 2) - 21 == 0x9
+; CHECK:      1c: 74 eb je -21
+
+; ERRORS-NOT: ICE translation error