First pass at emitIAS for branches and binding labels
Fixups of forward branches are not exercised yet; instead,
forward branches are streamed as a ".byte (foo - (. + 1))"
or ".long (foo - (. + 4))" directive for the external
assembler to resolve. Proper fixups can be supported once
emitIAS() delays writing its output until the function is
fully emitted (and therefore all forward labels have been
bound).
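
For example, a forward "jb" to a CFG node whose label is not yet
bound currently streams the opcode bytes plus the relaxed 32-bit
displacement directive, roughly like this (a sketch of the emitted
text; "foo" stands in for the actual label name):

  .byte 0x0f
  .byte 0x82
  .long foo - (. + 4)
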
BUG=none
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/673543002
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 414a8e1..606f785 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -94,7 +94,7 @@
template <typename T> T *getAssembler() const {
return static_cast<T *>(TargetAssembler.get());
}
- bool UseIntegratedAssembler() const {
+ bool useIntegratedAssembler() const {
return getContext()->getFlags().UseIntegratedAssembler;
}
bool hasComputedFrame() const;
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 4e4d36b..d715767 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "assembler.h"
#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceInst.h"
@@ -491,6 +492,10 @@
Str << Func->getContext()->mangleName(Func->getFunctionName()) << ":\n";
}
Str << getAsmName() << ":\n";
+ if (Func->useIntegratedAssembler()) {
+ Assembler *Asm = Func->getAssembler<Assembler>();
+ Asm->BindCfgNodeLabel(getIndex());
+ }
for (InstPhi *Phi : Phis) {
if (Phi->isDeleted())
continue;
@@ -505,7 +510,7 @@
// suppress them.
if (I->isRedundantAssign())
continue;
- if (Func->UseIntegratedAssembler()) {
+ if (Func->useIntegratedAssembler()) {
I->emitIAS(Func);
} else {
I->emit(Func);
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 9419866..ae4a30d 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -373,6 +373,33 @@
}
}
+void emitIASBytesBranch(const Cfg *Func, const x86::AssemblerX86 *Asm,
+ intptr_t StartPosition, const x86::Label *Label,
+ const IceString &LabelName, bool Near) {
+ // If this is a backward branch (the label is bound), we already know
+ // the offset and can emit the bytes directly. If this is a forward
+ // branch, we can't emit it as text in a streaming manner because the
+ // fixup hasn't happened yet. In that case, emit
+ // ".long ($BranchLabel) - (. + 4)" and let the external assembler do the fixup.
+ if (Label->IsBound()) {
+ emitIASBytes(Func, Asm, StartPosition);
+ return;
+ }
+ const intptr_t FwdBranchSize = Near ? 1 : 4;
+ const IceString FwdBranchDirective = Near ? ".byte" : ".long";
+ Ostream &Str = Func->getContext()->getStrEmit();
+ intptr_t EndPosition = Asm->GetPosition();
+ assert(EndPosition - StartPosition > FwdBranchSize);
+ for (intptr_t i = StartPosition; i < EndPosition - FwdBranchSize; ++i) {
+ Str << "\t.byte 0x";
+ Str.write_hex(Asm->LoadBuffer<uint8_t>(i));
+ Str << "\n";
+ }
+ Str << "\t" << FwdBranchDirective << " " << LabelName << " - (. + "
+ << FwdBranchSize << ")\n";
+ return;
+}
+
} // end of anonymous namespace
void InstX8632::dump(const Cfg *Func) const {
@@ -386,6 +413,15 @@
Str << getName(Func) << ":\n";
}
+void InstX8632Label::emitIAS(const Cfg *Func) const {
+ x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+ Asm->BindLocalLabel(Number);
+ // TODO(jvoung): remove the textual label once forward branch
+ // fixups are used (and the text assembler is not used).
+ Ostream &Str = Func->getContext()->getStrEmit();
+ Str << getName(Func) << ":\n";
+}
+
void InstX8632Label::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << getName(Func) << ":";
@@ -415,6 +451,47 @@
}
}
+void InstX8632Br::emitIAS(const Cfg *Func) const {
+ x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+ intptr_t StartPosition = Asm->GetPosition();
+ if (Label) {
+ x86::Label *L = Asm->GetOrCreateLocalLabel(Label->getNumber());
+ // In all these cases, local Labels should only be used for Near.
+ const bool Near = true;
+ if (Condition == CondX86::Br_None) {
+ Asm->jmp(L, Near);
+ } else {
+ Asm->j(Condition, L, Near);
+ }
+ emitIASBytesBranch(Func, Asm, StartPosition, L, Label->getName(Func), Near);
+ } else {
+ // Pessimistically assume it's far. This only affects Labels that
+ // are not Bound.
+ const bool Near = false;
+ if (Condition == CondX86::Br_None) {
+ x86::Label *L =
+ Asm->GetOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+ assert(!getTargetTrue());
+ Asm->jmp(L, Near);
+ emitIASBytesBranch(Func, Asm, StartPosition, L,
+ getTargetFalse()->getAsmName(), Near);
+ } else {
+ x86::Label *L = Asm->GetOrCreateCfgNodeLabel(getTargetTrue()->getIndex());
+ Asm->j(Condition, L, Near);
+ emitIASBytesBranch(Func, Asm, StartPosition, L,
+ getTargetTrue()->getAsmName(), Near);
+ StartPosition = Asm->GetPosition();
+ if (getTargetFalse()) {
+ x86::Label *L2 =
+ Asm->GetOrCreateCfgNodeLabel(getTargetFalse()->getIndex());
+ Asm->jmp(L2, Near);
+ emitIASBytesBranch(Func, Asm, StartPosition, L2,
+ getTargetFalse()->getAsmName(), Near);
+ }
+ }
+ }
+}
+
void InstX8632Br::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << "br ";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 8235d1f..81d3fd8 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -274,13 +274,14 @@
}
};
-// InstX8632Label represents an intra-block label that is the
-// target of an intra-block branch. These are used for lowering i1
-// calculations, Select instructions, and 64-bit compares on a 32-bit
-// architecture, without basic block splitting. Basic block splitting
-// is not so desirable for several reasons, one of which is the impact
-// on decisions based on whether a variable's live range spans
-// multiple basic blocks.
+// InstX8632Label represents an intra-block label that is the target
+// of an intra-block branch. The offset between the label and the
+// branch must fit into one byte (i.e., a "near" branch). These are
+// used for lowering i1 calculations, Select instructions, and 64-bit
+// compares on a 32-bit architecture, without basic block splitting.
+// Basic block splitting is not so desirable for several reasons, one
+// of which is the impact on decisions based on whether a variable's
+// live range spans multiple basic blocks.
//
// Intra-block control flow must be used with caution. Consider the
// sequence for "c = (a >= b ? x : y)".
@@ -321,15 +322,15 @@
}
uint32_t getEmitInstCount() const override { return 0; }
IceString getName(const Cfg *Func) const;
+ SizeT getNumber() const { return Number; }
void emit(const Cfg *Func) const override;
- // TODO(jvoung): Filler in.
- void emitIAS(const Cfg *Func) const override { emit(Func); }
+ void emitIAS(const Cfg *Func) const override;
void dump(const Cfg *Func) const override;
private:
InstX8632Label(Cfg *Func, TargetX8632 *Target);
~InstX8632Label() override {}
- SizeT Number; // used only for unique label string generation
+ SizeT Number; // used for unique label generation.
};
// Conditional and unconditional branch instruction.
@@ -385,8 +386,7 @@
return Sum;
}
void emit(const Cfg *Func) const override;
- // TODO(jvoung): Filler in.
- void emitIAS(const Cfg *Func) const override { emit(Func); }
+ void emitIAS(const Cfg *Func) const override;
void dump(const Cfg *Func) const override;
static bool classof(const Inst *Inst) { return isClassof(Inst, Br); }
diff --git a/src/assembler.h b/src/assembler.h
index d769b3a..d3cd9bf 100644
--- a/src/assembler.h
+++ b/src/assembler.h
@@ -207,7 +207,7 @@
public:
Assembler() {}
- ~Assembler() {}
+ virtual ~Assembler() {}
// Allocate a chunk of bytes using the per-Assembler allocator.
uintptr_t AllocateBytes(size_t bytes) {
@@ -224,6 +224,8 @@
// Allocate data of type T using the per-Assembler allocator.
template <typename T> T *Allocate() { return Allocator.Allocate<T>(); }
+ virtual void BindCfgNodeLabel(SizeT NodeNumber) = 0;
+
private:
llvm::BumpPtrAllocator Allocator;
};
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
index 2b80820..11ff859 100644
--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -67,6 +67,53 @@
return x86::Address::Absolute(Fixup);
}
+AssemblerX86::~AssemblerX86() {
+#ifndef NDEBUG
+ for (const Label *Label : CfgNodeLabels) {
+ Label->FinalCheck();
+ }
+ for (const Label *Label : LocalLabels) {
+ Label->FinalCheck();
+ }
+#endif
+}
+
+Label *AssemblerX86::GetOrCreateLabel(SizeT Number, LabelVector &Labels) {
+ Label *L = nullptr;
+ if (Number == Labels.size()) {
+ L = new (this->Allocate<Label>()) Label();
+ Labels.push_back(L);
+ return L;
+ }
+ if (Number > Labels.size()) {
+ Labels.resize(Number + 1);
+ }
+ L = Labels[Number];
+ if (!L) {
+ L = new (this->Allocate<Label>()) Label();
+ Labels[Number] = L;
+ }
+ return L;
+}
+
+Label *AssemblerX86::GetOrCreateCfgNodeLabel(SizeT NodeNumber) {
+ return GetOrCreateLabel(NodeNumber, CfgNodeLabels);
+}
+
+Label *AssemblerX86::GetOrCreateLocalLabel(SizeT Number) {
+ return GetOrCreateLabel(Number, LocalLabels);
+}
+
+void AssemblerX86::BindCfgNodeLabel(SizeT NodeNumber) {
+ Label *L = GetOrCreateCfgNodeLabel(NodeNumber);
+ this->Bind(L);
+}
+
+void AssemblerX86::BindLocalLabel(SizeT Number) {
+ Label *L = GetOrCreateLocalLabel(Number);
+ this->Bind(L);
+}
+
void AssemblerX86::call(GPRRegister reg) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0xFF);
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index bb3f0ab..b1079d1 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -284,7 +284,9 @@
#endif // !NDEBUG
}
- ~Label() {
+ ~Label() {}
+
+ void FinalCheck() const {
// Assert if label is being destroyed with unresolved branches pending.
assert(!IsLinked());
assert(!HasNear());
@@ -363,11 +365,16 @@
assert(!use_far_branches);
(void)use_far_branches;
}
- ~AssemblerX86() {}
+ ~AssemblerX86() override;
static const bool kNearJump = true;
static const bool kFarJump = false;
+ Label *GetOrCreateCfgNodeLabel(SizeT NodeNumber);
+ void BindCfgNodeLabel(SizeT NodeNumber) override;
+ Label *GetOrCreateLocalLabel(SizeT Number);
+ void BindLocalLabel(SizeT Number);
+
// Operations to emit GPR instructions (and dispatch on operand type).
typedef void (AssemblerX86::*TypedEmitGPR)(Type, GPRRegister);
typedef void (AssemblerX86::*TypedEmitAddr)(Type, const Address &);
@@ -848,6 +855,14 @@
void EmitGenericShift(int rm, Type Ty, const Operand &operand,
GPRRegister shifter);
+ typedef std::vector<Label *> LabelVector;
+ // A vector of pool-allocated x86 labels for CFG nodes.
+ LabelVector CfgNodeLabels;
+ // A vector of pool-allocated x86 labels for Local labels.
+ LabelVector LocalLabels;
+
+ Label *GetOrCreateLabel(SizeT Number, LabelVector &Labels);
+
AssemblerBuffer buffer_;
};
diff --git a/tests_lit/assembler/x86/jump_encodings.ll b/tests_lit/assembler/x86/jump_encodings.ll
new file mode 100644
index 0000000..130d59f
--- /dev/null
+++ b/tests_lit/assembler/x86/jump_encodings.ll
@@ -0,0 +1,218 @@
+; Tests various aspects of x86 branch encodings (near vs far,
+; forward vs backward, using CFG labels, or local labels).
+
+; Use -ffunction-sections so that the offsets reset for each function.
+; RUN: %p2i -i %s --args -O2 --verbose none -ffunction-sections \
+; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
+
+; Use atomic ops as filler, which shouldn't get optimized out.
+declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
+declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
+declare i32 @llvm.nacl.atomic.rmw.i32(i32, i32*, i32, i32)
+
+define void @test_near_backward(i32 %iptr, i32 %val) {
+entry:
+ br label %next
+next:
+ %ptr = inttoptr i32 %iptr to i32*
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ br label %next2
+next2:
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ %cmp = icmp ult i32 %val, 0
+ br i1 %cmp, label %next2, label %next
+}
+
+; CHECK-LABEL: test_near_backward
+; CHECK: 8: {{.*}} mov dword ptr
+; CHECK-NEXT: a: {{.*}} mfence
+; CHECK-NEXT: d: {{.*}} mov dword ptr
+; CHECK-NEXT: f: {{.*}} mfence
+; CHECK-NEXT: 12: {{.*}} cmp
+; (0x15 + 2) - 10 == 0xd
+; CHECK-NEXT: 15: 72 f6 jb -10
+; (0x17 + 2) - 17 == 0x8
+; CHECK-NEXT: 17: eb ef jmp -17
+
+; Test one of the backward branches being too large for 8 bits
+; and one being just okay.
+define void @test_far_backward1(i32 %iptr, i32 %val) {
+entry:
+ br label %next
+next:
+ %ptr = inttoptr i32 %iptr to i32*
+ %tmp = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ br label %next2
+next2:
+ call void @llvm.nacl.atomic.store.i32(i32 %tmp, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ %cmp = icmp ugt i32 %val, 0
+ br i1 %cmp, label %next2, label %next
+}
+
+; CHECK-LABEL: test_far_backward1
+; CHECK: 8: {{.*}} mov {{.*}}, dword ptr [e{{[^s]}}
+; CHECK-NEXT: a: {{.*}} mov dword ptr
+; CHECK-NEXT: c: {{.*}} mfence
+; (0x85 + 2) - 125 == 0xa
+; CHECK: 85: 77 83 ja -125
+; (0x87 + 5) - 132 == 0x8
+; CHECK-NEXT: 87: e9 7c ff ff ff jmp -132
+
+; Same as test_far_backward1, but with the conditional branch being
+; the one that is too far.
+define void @test_far_backward2(i32 %iptr, i32 %val) {
+entry:
+ br label %next
+next:
+ %ptr = inttoptr i32 %iptr to i32*
+ %tmp = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ %tmp2 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ %tmp3 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ %tmp4 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ %tmp5 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
+ br label %next2
+next2:
+ call void @llvm.nacl.atomic.store.i32(i32 %tmp, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %tmp2, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %tmp3, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %tmp4, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %tmp5, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ %cmp = icmp sle i32 %val, 0
+ br i1 %cmp, label %next, label %next2
+}
+
+; CHECK-LABEL: test_far_backward2
+; CHECK: c: {{.*}} mov {{.*}}, dword ptr [e{{[^s]}}
+; CHECK: 14: {{.*}} mov {{.*}}, dword ptr
+; CHECK-NEXT: 16: {{.*}} mov dword ptr
+; CHECK-NEXT: 18: {{.*}} mfence
+; (0x8c + 6) - 134 == 0xc
+; CHECK: 8c: 0f 8e 7a ff ff ff jle -134
+; (0x92 + 2) - 126 == 0x16
+; CHECK-NEXT: 92: eb 82 jmp -126
+
+define void @test_near_forward(i32 %iptr, i32 %val) {
+entry:
+ br label %next1
+next1:
+ %ptr = inttoptr i32 %iptr to i32*
+ %cmp = icmp ult i32 %val, 0
+ br i1 %cmp, label %next3, label %next2
+next2:
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ br label %next3
+next3:
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ br label %next1
+}
+; Forward branches for non-local labels currently use the fully relaxed
+; form to avoid needing a relaxation pass.
+; CHECK-LABEL: test_near_forward
+; CHECK: 8: {{.*}} cmp
+; CHECK-NEXT: b: 0f 82 05 00 00 00 jb 5
+; CHECK-NEXT: 11: {{.*}} mov dword ptr
+; CHECK-NEXT: 13: {{.*}} mfence
+; Forward branch is 5 bytes ahead to here.
+; CHECK-NEXT: 16: {{.*}} mov dword ptr
+; Jumps back to (0x1b + 2) - 21 == 0x8 (to a point before the forward
+; branch, confirming that the forward branch was indeed 6 bytes).
+; CHECK: 1b: eb eb jmp -21
+
+
+; Unlike forward branches to CFG nodes, "local" forward branches
+; always use a 1-byte displacement.
+; Check local forward branches, followed by a near backward branch,
+; to make sure that the instruction size accounting for the forward
+; branches is correct by the time the backward branch is hit.
+; A 64-bit compare happens to use local forward branches.
+define void @test_local_forward_then_back(i64 %val64, i32 %iptr, i32 %val) {
+entry:
+ br label %next
+next:
+ %ptr = inttoptr i32 %iptr to i32*
+ call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
+ br label %next2
+next2:
+ %cmp = icmp ult i64 %val64, 0
+ br i1 %cmp, label %next, label %next2
+}
+; CHECK-LABEL: test_local_forward_then_back
+; CHECK: 14: {{.*}} mov dword ptr
+; CHECK-NEXT: 16: {{.*}} mfence
+; CHECK-NEXT: 19: {{.*}} mov dword ptr {{.*}}, 1
+; CHECK-NEXT: 20: {{.*}} cmp
+; CHECK-NEXT: 23: {{.*}} jb 14
+; (0x37 + 2) - 37 == 0x14
+; CHECK: 37: {{.*}} jne -37
+; (0x39 + 2) - 34 == 0x19
+; CHECK: 39: {{.*}} jmp -34
+
+
+; Test that backward local branches also work and are small.
+; Some of the atomic instructions use a cmpxchg loop.
+define void @test_local_backward(i64 %val64, i32 %iptr, i32 %val) {
+entry:
+ br label %next
+next:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %val, i32 6)
+ br label %next2
+next2:
+ %success = icmp eq i32 1, %a
+ br i1 %success, label %next, label %next2
+}
+; CHECK-LABEL: test_local_backward
+; CHECK: 9: {{.*}} mov {{.*}}, dword
+; CHECK: b: {{.*}} mov
+; CHECK-NEXT: d: {{.*}} xor
+; CHECK-NEXT: f: {{.*}} lock
+; CHECK-NEXT: 10: {{.*}} cmpxchg
+; (0x13 + 2) - 10 == 0xb
+; CHECK-NEXT: 13: 75 f6 jne -10
+; (0x1c + 2) - 21 == 0x9
+; CHECK: 1c: 74 eb je -21
+
+; ERRORS-NOT: ICE translation error