Subzero: Add sandboxing for x86-32.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4079
R=jvoung@chromium.org, kschimpf@google.com

Review URL: https://codereview.chromium.org/930733002
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index bae6c77..2900a76 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -436,6 +436,8 @@
   for (uint8_t I : Asm->getNonExecBundlePadding())
     Str.write_hex(I);
   Str << "\n";
+  if (Ctx->getFlags().getUseSandboxing())
+    Str << "\t.bundle_align_mode " << Asm->getBundleAlignLog2Bytes() << "\n";
   Str << MangledName << ":\n";
 }
 
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index d269ee2..c7a3d81 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -834,6 +834,8 @@
 }
 
 void updateStats(Cfg *Func, const Inst *I) {
+  if (!ALLOW_DUMP)
+    return;
   // Update emitted instruction count, plus fill/spill count for
   // Variable operands without a physical register.
   if (uint32_t Count = I->getEmitInstCount()) {
@@ -891,9 +893,127 @@
     emitRegisterUsage(Str, Func, this, false, LiveRegCount);
 }
 
+// Helper class for emitIAS().
+namespace {
+class BundleEmitHelper {
+  BundleEmitHelper() = delete;
+  BundleEmitHelper(const BundleEmitHelper &) = delete;
+  BundleEmitHelper &operator=(const BundleEmitHelper &) = delete;
+
+public:
+  BundleEmitHelper(Assembler *Asm, const InstList &Insts)
+      : Asm(Asm), End(Insts.end()), BundleLockStart(End),
+        BundleSize(1 << Asm->getBundleAlignLog2Bytes()),
+        BundleMaskLo(BundleSize - 1), BundleMaskHi(~BundleMaskLo),
+        SizeSnapshotPre(0), SizeSnapshotPost(0) {}
+  // Check whether we're currently within a bundle_lock region.
+  bool isInBundleLockRegion() const { return BundleLockStart != End; }
+  // Check whether the current bundle_lock region has the align_to_end
+  // option.
+  bool isAlignToEnd() const {
+    assert(isInBundleLockRegion());
+    return llvm::cast<InstBundleLock>(getBundleLockStart())->getOption() ==
+           InstBundleLock::Opt_AlignToEnd;
+  }
+  // Check whether the entire bundle_lock region falls within the same
+  // bundle.
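+  // For example, assuming 32-byte bundles: a region occupying buffer
+  // offsets [0x24, 0x3a) satisfies (0x24 & ~0x1f) == ((0x3a - 1) & ~0x1f)
+  // == 0x20, so it falls within a single bundle.  (Offsets here are
+  // illustrative only.)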
+  bool isSameBundle() const {
+    assert(isInBundleLockRegion());
+    return SizeSnapshotPre == SizeSnapshotPost ||
+           (SizeSnapshotPre & BundleMaskHi) ==
+               ((SizeSnapshotPost - 1) & BundleMaskHi);
+  }
+  // Get the bundle alignment of the first instruction of the
+  // bundle_lock region.
+  intptr_t getPreAlignment() const {
+    assert(isInBundleLockRegion());
+    return SizeSnapshotPre & BundleMaskLo;
+  }
+  // Get the bundle alignment of the first instruction past the
+  // bundle_lock region.
+  intptr_t getPostAlignment() const {
+    assert(isInBundleLockRegion());
+    return SizeSnapshotPost & BundleMaskLo;
+  }
+  // Get the iterator pointing to the bundle_lock instruction, e.g. to
+  // roll back the instruction iteration to that point.
+  InstList::const_iterator getBundleLockStart() const {
+    assert(isInBundleLockRegion());
+    return BundleLockStart;
+  }
+  // Set up bookkeeping when the bundle_lock instruction is first
+  // processed.
+  void enterBundleLock(InstList::const_iterator I) {
+    assert(!isInBundleLockRegion());
+    BundleLockStart = I;
+    SizeSnapshotPre = Asm->getBufferSize();
+    Asm->setPreliminary(true);
+    assert(isInBundleLockRegion());
+  }
+  // Update bookkeeping when the bundle_unlock instruction is
+  // processed.
+  void enterBundleUnlock() {
+    assert(isInBundleLockRegion());
+    SizeSnapshotPost = Asm->getBufferSize();
+  }
+  // Update bookkeeping when we are completely finished with the
+  // bundle_lock region.
+  void leaveBundleLockRegion() { BundleLockStart = End; }
+  // Check whether the instruction sequence fits within the current
+  // bundle, and if not, add nop padding to the end of the current
+  // bundle.
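+  // For example, assuming 32-byte bundles: a region starting at offset
+  // 0x38 that crosses into the next bundle has getPreAlignment() == 0x18,
+  // so 0x20 - 0x18 == 8 bytes of nops are emitted, and the region will be
+  // re-emitted starting at the bundle boundary 0x40.  (Offsets are
+  // illustrative only.)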
+  void padToNextBundle() {
+    assert(isInBundleLockRegion());
+    if (!isSameBundle()) {
+      intptr_t PadToNextBundle = BundleSize - getPreAlignment();
+      Asm->padWithNop(PadToNextBundle);
+      SizeSnapshotPre += PadToNextBundle;
+      SizeSnapshotPost += PadToNextBundle;
+      assert((Asm->getBufferSize() & BundleMaskLo) == 0);
+      assert(Asm->getBufferSize() == SizeSnapshotPre);
+    }
+  }
+  // If align_to_end is specified, add padding such that the
+  // instruction sequence ends precisely at a bundle boundary.
+  void padForAlignToEnd() {
+    assert(isInBundleLockRegion());
+    if (isAlignToEnd()) {
+      if (intptr_t Offset = getPostAlignment()) {
+        Asm->padWithNop(BundleSize - Offset);
+        SizeSnapshotPre = Asm->getBufferSize();
+      }
+    }
+  }
+  // Update bookkeeping when rolling back for the second pass.
+  void rollback() {
+    assert(isInBundleLockRegion());
+    Asm->setBufferSize(SizeSnapshotPre);
+    Asm->setPreliminary(false);
+  }
+
+private:
+  Assembler *const Asm;
+  // End is a sentinel value such that BundleLockStart==End implies
+  // that we are not in a bundle_lock region.
+  const InstList::const_iterator End;
+  InstList::const_iterator BundleLockStart;
+  const intptr_t BundleSize;
+  // Masking with BundleMaskLo identifies an address's bundle offset.
+  const intptr_t BundleMaskLo;
+  // Masking with BundleMaskHi identifies an address's bundle.
+  const intptr_t BundleMaskHi;
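+  // (E.g., assuming 32-byte bundles: BundleMaskLo == 0x1f and
+  // BundleMaskHi == ~0x1f.)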
+  intptr_t SizeSnapshotPre;
+  intptr_t SizeSnapshotPost;
+};
+
+} // end of anonymous namespace
+
 void CfgNode::emitIAS(Cfg *Func) const {
   Func->setCurrentNode(this);
   Assembler *Asm = Func->getAssembler<>();
+  // TODO(stichnot): When sandboxing, defer binding the node label
+  // until just before the first instruction is emitted, to reduce the
+  // chance that a padding nop is a branch target.
   Asm->BindCfgNodeLabel(getIndex());
   for (const Inst &I : Phis) {
     if (I.isDeleted())
@@ -901,14 +1021,99 @@
     // Emitting a Phi instruction should cause an error.
     I.emitIAS(Func);
   }
-  for (const Inst &I : Insts) {
-    if (I.isDeleted())
-      continue;
-    if (I.isRedundantAssign())
-      continue;
-    I.emitIAS(Func);
-    updateStats(Func, &I);
+
+  // Do the simple emission if not sandboxed.
+  if (!Func->getContext()->getFlags().getUseSandboxing()) {
+    for (const Inst &I : Insts) {
+      if (!I.isDeleted() && !I.isRedundantAssign()) {
+        I.emitIAS(Func);
+        updateStats(Func, &I);
+      }
+    }
+    return;
   }
+
+  // The remainder of the function handles emission with sandboxing.
+  // There are explicit bundle_lock regions delimited by bundle_lock
+  // and bundle_unlock instructions.  All other instructions are
+  // treated as an implicit one-instruction bundle_lock region.
+  // Emission is done twice for each bundle_lock region.  The first
+  // pass is a preliminary pass, after which we can figure out what
+  // nop padding is needed, then roll back, and make the final pass.
+  //
+  // Ideally, the first pass would be speculative and the second pass
+  // would only be done if nop padding were needed, but the structure
+  // of the integrated assembler makes it hard to roll back the state
+  // of label bindings, label links, and relocation fixups.  Instead,
+  // the first pass just disables all mutation of that state.
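+  //
+  // For illustration, assuming 32-byte bundles: a 5-byte bundle_lock
+  // sequence whose first pass begins at offset 0x1d would cross the
+  // 0x20 boundary.  On reaching bundle_unlock, the buffer is rolled
+  // back to 0x1d, 3 bytes of nops are emitted to reach 0x20, and the
+  // second pass re-emits the 5 bytes starting at 0x20.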
+
+  BundleEmitHelper Helper(Asm, Insts);
+  InstList::const_iterator End = Insts.end();
+  // Retrying indicates that we had to roll back to the bundle_lock
+  // instruction to apply padding before the bundle_lock sequence.
+  bool Retrying = false;
+  for (InstList::const_iterator I = Insts.begin(); I != End; ++I) {
+    if (I->isDeleted() || I->isRedundantAssign())
+      continue;
+
+    if (llvm::isa<InstBundleLock>(I)) {
+      // Set up the initial bundle_lock state.  This should not happen
+      // while retrying, because the retry rolls back to the
+      // instruction following the bundle_lock instruction.
+      assert(!Retrying);
+      Helper.enterBundleLock(I);
+      continue;
+    }
+
+    if (llvm::isa<InstBundleUnlock>(I)) {
+      Helper.enterBundleUnlock();
+      if (Retrying) {
+        // Make sure all instructions are in the same bundle.
+        assert(Helper.isSameBundle());
+        // If align_to_end is specified, make sure the next
+        // instruction begins a new bundle.
+        assert(!Helper.isAlignToEnd() || Helper.getPostAlignment() == 0);
+        Helper.leaveBundleLockRegion();
+        Retrying = false;
+      } else {
+        // This is the first pass, so roll back for the retry pass.
+        Helper.rollback();
+        // Pad to the next bundle if the instruction sequence crossed
+        // a bundle boundary.
+        Helper.padToNextBundle();
+        // Insert additional padding to make AlignToEnd work.
+        Helper.padForAlignToEnd();
+        // Prepare for the retry pass after padding is done.
+        Retrying = true;
+        I = Helper.getBundleLockStart();
+      }
+      continue;
+    }
+
+    // I points to a non-bundle_lock/bundle_unlock instruction.
+    if (Helper.isInBundleLockRegion()) {
+      I->emitIAS(Func);
+      // Only update stats during the final pass.
+      if (Retrying)
+        updateStats(Func, I);
+    } else {
+      // Treat it as though there were an implicit bundle_lock and
+      // bundle_unlock wrapping the instruction.
+      Helper.enterBundleLock(I);
+      I->emitIAS(Func);
+      Helper.enterBundleUnlock();
+      Helper.rollback();
+      Helper.padToNextBundle();
+      I->emitIAS(Func);
+      updateStats(Func, I);
+      Helper.leaveBundleLockRegion();
+    }
+  }
+
+  // Don't allow bundle locking across basic blocks, to keep the
+  // backtracking mechanism simple.
+  assert(!Helper.isInBundleLockRegion());
+  assert(!Retrying);
 }
 
 void CfgNode::dump(Cfg *Func) const {
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 3327455..ce5bde0 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -436,6 +436,13 @@
 InstUnreachable::InstUnreachable(Cfg *Func)
     : InstHighLevel(Func, Inst::Unreachable, 0, nullptr) {}
 
+InstBundleLock::InstBundleLock(Cfg *Func, InstBundleLock::Option BundleOption)
+    : InstHighLevel(Func, Inst::BundleLock, 0, nullptr),
+      BundleOption(BundleOption) {}
+
+InstBundleUnlock::InstBundleUnlock(Cfg *Func)
+    : InstHighLevel(Func, Inst::BundleUnlock, 0, nullptr) {}
+
 InstFakeDef::InstFakeDef(Cfg *Func, Variable *Dest, Variable *Src)
     : InstHighLevel(Func, Inst::FakeDef, Src ? 1 : 0, Dest) {
   assert(Dest);
@@ -774,6 +781,48 @@
   Str << "unreachable";
 }
 
+void InstBundleLock::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t.bundle_lock";
+  switch (BundleOption) {
+  case Opt_None:
+    break;
+  case Opt_AlignToEnd:
+    Str << "\talign_to_end";
+    break;
+  }
+}
+
+void InstBundleLock::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "bundle_lock";
+  switch (BundleOption) {
+  case Opt_None:
+    break;
+  case Opt_AlignToEnd:
+    Str << " align_to_end";
+    break;
+  }
+}
+
+void InstBundleUnlock::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t.bundle_unlock";
+}
+
+void InstBundleUnlock::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "bundle_unlock";
+}
+
 void InstFakeDef::emit(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
diff --git a/src/IceInst.h b/src/IceInst.h
index 2d6d054..72422c9 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -59,11 +59,13 @@
     Select,
     Store,
     Switch,
-    FakeDef,  // not part of LLVM/PNaCl bitcode
-    FakeUse,  // not part of LLVM/PNaCl bitcode
-    FakeKill, // not part of LLVM/PNaCl bitcode
-    Target    // target-specific low-level ICE
-              // Anything >= Target is an InstTarget subclass.
+    BundleLock,   // not part of LLVM/PNaCl bitcode
+    BundleUnlock, // not part of LLVM/PNaCl bitcode
+    FakeDef,      // not part of LLVM/PNaCl bitcode
+    FakeUse,      // not part of LLVM/PNaCl bitcode
+    FakeKill,     // not part of LLVM/PNaCl bitcode
+    Target        // target-specific low-level ICE
+                  // Anything >= Target is an InstTarget subclass.
   };
   InstKind getKind() const { return Kind; }
 
@@ -743,6 +745,53 @@
   ~InstUnreachable() override {}
 };
 
+// BundleLock instruction.  There are no operands.  Contains an option
+// indicating whether align_to_end is specified.
+class InstBundleLock : public InstHighLevel {
+  InstBundleLock(const InstBundleLock &) = delete;
+  InstBundleLock &operator=(const InstBundleLock &) = delete;
+
+public:
+  enum Option { Opt_None, Opt_AlignToEnd };
+  static InstBundleLock *create(Cfg *Func, Option BundleOption) {
+    return new (Func->allocate<InstBundleLock>())
+        InstBundleLock(Func, BundleOption);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg * /* Func */) const override {}
+  void dump(const Cfg *Func) const override;
+  Option getOption() const { return BundleOption; }
+  static bool classof(const Inst *Inst) {
+    return Inst->getKind() == BundleLock;
+  }
+
+private:
+  Option BundleOption;
+  InstBundleLock(Cfg *Func, Option BundleOption);
+  ~InstBundleLock() override {}
+};
+
+// BundleUnlock instruction.  There are no operands.
+class InstBundleUnlock : public InstHighLevel {
+  InstBundleUnlock(const InstBundleUnlock &) = delete;
+  InstBundleUnlock &operator=(const InstBundleUnlock &) = delete;
+
+public:
+  static InstBundleUnlock *create(Cfg *Func) {
+    return new (Func->allocate<InstBundleUnlock>()) InstBundleUnlock(Func);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg * /* Func */) const override {}
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) {
+    return Inst->getKind() == BundleUnlock;
+  }
+
+private:
+  explicit InstBundleUnlock(Cfg *Func);
+  ~InstBundleUnlock() override {}
+};
+
 // FakeDef instruction.  This creates a fake definition of a variable,
 // which is how we represent the case when an instruction produces
 // multiple results.  This doesn't happen with high-level ICE
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 30edb9e..815b617 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -200,6 +200,11 @@
   return false;
 }
 
+InstX8632Jmp::InstX8632Jmp(Cfg *Func, Operand *Target)
+    : InstX8632(Func, InstX8632::Jmp, 1, nullptr) {
+  addSource(Target);
+}
+
 InstX8632Call::InstX8632Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
     : InstX8632(Func, InstX8632::Call, 1, Dest) {
   HasSideEffects = true;
@@ -450,6 +455,59 @@
   }
 }
 
+void InstX8632Jmp::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Str << "\tjmp\t*";
+  getJmpTarget()->emit(Func);
+}
+
+void InstX8632Jmp::emitIAS(const Cfg *Func) const {
+  // Note: Adapted (mostly copied) from InstX8632Call::emitIAS().
+  x86::AssemblerX86 *Asm = Func->getAssembler<x86::AssemblerX86>();
+  Operand *Target = getJmpTarget();
+  if (const auto Var = llvm::dyn_cast<Variable>(Target)) {
+    if (Var->hasReg()) {
+      Asm->jmp(RegX8632::getEncodedGPR(Var->getRegNum()));
+    } else {
+      // A jmp through a memory operand should be possible to encode,
+      // but it isn't a valid sandboxed instruction, and there
+      // shouldn't be a register allocation problem with jumping
+      // through a scratch register instead, so we don't bother
+      // implementing it.
+      llvm::report_fatal_error("Assembler can't jmp to memory operand");
+    }
+  } else if (const auto Mem = llvm::dyn_cast<OperandX8632Mem>(Target)) {
+    assert(Mem->getSegmentRegister() == OperandX8632Mem::DefaultSegment);
+    llvm::report_fatal_error("Assembler can't jmp to memory operand");
+  } else if (const auto CR = llvm::dyn_cast<ConstantRelocatable>(Target)) {
+    assert(CR->getOffset() == 0 && "We only support jumping to a function");
+    Asm->jmp(CR);
+  } else if (const auto Imm = llvm::dyn_cast<ConstantInteger32>(Target)) {
+    // NaCl trampoline calls refer to an address within the sandbox directly.
+    // This is usually only needed for non-IRT builds and otherwise not
+    // very portable or stable. For this, we would use the 0xE8 opcode
+    // (relative version of call) and there should be a PC32 reloc too.
+    // The PC32 reloc will have symbol index 0, and the absolute address
+    // would be encoded as an offset relative to the next instruction.
+    // TODO(jvoung): Do we need to support this?
+    (void)Imm;
+    llvm::report_fatal_error("Unexpected jmp to absolute address");
+  } else {
+    llvm::report_fatal_error("Unexpected operand type");
+  }
+}
+
+void InstX8632Jmp::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "jmp ";
+  getJmpTarget()->dump(Func);
+}
+
 void InstX8632Call::emit(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
@@ -2729,7 +2787,7 @@
   Ostream &Str = Func->getContext()->getStrEmit();
   if (SegmentReg != DefaultSegment) {
     assert(SegmentReg >= 0 && SegmentReg < SegReg_NUM);
-    Str << InstX8632SegmentRegNames[SegmentReg] << ":";
+    Str << "%" << InstX8632SegmentRegNames[SegmentReg] << ":";
   }
   // Emit as Offset(Base,Index,1<<Shift).
   // Offset is emitted without the leading '$'.
@@ -2737,7 +2795,7 @@
   if (!Offset) {
     // No offset, emit nothing.
   } else if (const auto CI = llvm::dyn_cast<ConstantInteger32>(Offset)) {
-    if (CI->getValue())
+    if (Base == nullptr || CI->getValue())
       // Emit a non-zero offset without a leading '$'.
       Str << CI->getValue();
   } else if (const auto CR = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 7725ad7..ed4924f 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -196,6 +196,7 @@
     Idiv,
     Imul,
     Insertps,
+    Jmp,
     Label,
     Lea,
     Load,
@@ -400,6 +401,27 @@
   const InstX8632Label *Label; // Intra-block branch target
 };
 
+// Jump to a target outside this function, such as tailcall, nacljump,
+// naclret, unreachable.  This is different from a Branch instruction
+// in that there is no intra-function control flow to represent.
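+// In sandboxed code this is used, e.g., for the masked indirect jump in
+// the sandboxed return sequence.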
+class InstX8632Jmp : public InstX8632 {
+  InstX8632Jmp(const InstX8632Jmp &) = delete;
+  InstX8632Jmp &operator=(const InstX8632Jmp &) = delete;
+
+public:
+  static InstX8632Jmp *create(Cfg *Func, Operand *Target) {
+    return new (Func->allocate<InstX8632Jmp>()) InstX8632Jmp(Func, Target);
+  }
+  Operand *getJmpTarget() const { return getSrc(0); }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Jmp); }
+
+private:
+  InstX8632Jmp(Cfg *Func, Operand *Target);
+};
+
 // AdjustStack instruction - subtracts esp by the given amount and
 // updates the stack offset during code emission.
 class InstX8632AdjustStack : public InstX8632 {
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 8169ae1..461b086 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -218,6 +218,8 @@
   case Inst::Unreachable:
     lowerUnreachable(llvm::dyn_cast<InstUnreachable>(Inst));
     break;
+  case Inst::BundleLock:
+  case Inst::BundleUnlock:
   case Inst::FakeDef:
   case Inst::FakeUse:
   case Inst::FakeKill:
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 0467320..1e4d282 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -963,6 +963,29 @@
       _pop(getPhysicalRegister(j));
     }
   }
+
+  if (!Ctx->getFlags().getUseSandboxing())
+    return;
+  // Change the original ret instruction into a sandboxed return sequence.
+  // t:ecx = pop
+  // bundle_lock
+  // and t, ~31
+  // jmp *t
+  // bundle_unlock
+  // FakeUse <original_ret_operand>
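+  //
+  // In the emitted (AT&T-syntax) assembly this looks roughly like
+  // (assuming 32-byte bundles, so the mask is ~31):
+  //   popl %ecx
+  //   .bundle_lock
+  //   andl $0xFFFFFFE0, %ecx
+  //   jmp *%ecx
+  //   .bundle_unlock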
+  const SizeT BundleSize = 1
+                           << Func->getAssembler<>()->getBundleAlignLog2Bytes();
+  Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
+  _pop(T_ecx);
+  _bundle_lock();
+  _and(T_ecx, Ctx->getConstantInt32(~(BundleSize - 1)));
+  _jmp(T_ecx);
+  _bundle_unlock();
+  if (RI->getSrcSize()) {
+    Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
+    Context.insert(InstFakeUse::create(Func, RetValue));
+  }
+  RI->setDeleted();
 }
 
 void TargetX8632::split64(Variable *Var) {
@@ -1815,8 +1838,24 @@
     }
   }
   Operand *CallTarget = legalize(Instr->getCallTarget());
+  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
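+  // When sandboxing, an indirect call target is masked down to a bundle
+  // boundary, and the call is placed in a bundle_lock align_to_end region
+  // so that the return address immediately following the call is
+  // bundle-aligned.  A direct (constant-target) call only needs the
+  // align_to_end region.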
+  if (NeedSandboxing) {
+    if (llvm::isa<Constant>(CallTarget)) {
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+    } else {
+      Variable *CallTargetVar = nullptr;
+      _mov(CallTargetVar, CallTarget);
+      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
+      const SizeT BundleSize =
+          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
+      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
+      CallTarget = CallTargetVar;
+    }
+  }
   Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
   Context.insert(NewCall);
+  if (NeedSandboxing)
+    _bundle_unlock();
   if (ReturnRegHi)
     Context.insert(InstFakeDef::create(Func, ReturnRegHi));
 
@@ -3829,6 +3868,9 @@
       _mov(Reg, Src0, RegX8632::Reg_eax);
     }
   }
+  // Add a ret instruction even if sandboxing is enabled, because
+  // addEpilog explicitly looks for a ret instruction as a marker for
+  // where to insert the frame removal instructions.
   _ret(Reg);
   // Add a fake use of esp to make sure esp stays alive for the entire
   // function.  Otherwise post-call esp adjustments get dead-code
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 543183e..b376ec3 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -230,6 +230,11 @@
   void _bswap(Variable *SrcDest) {
     Context.insert(InstX8632Bswap::create(Func, SrcDest));
   }
+  void
+  _bundle_lock(InstBundleLock::Option BundleOption = InstBundleLock::Opt_None) {
+    Context.insert(InstBundleLock::create(Func, BundleOption));
+  }
+  void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
   void _cbwdq(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
   }
@@ -286,6 +291,9 @@
   void _insertps(Variable *Dest, Operand *Src0, Operand *Src1) {
     Context.insert(InstX8632Insertps::create(Func, Dest, Src0, Src1));
   }
+  void _jmp(Operand *Target) {
+    Context.insert(InstX8632Jmp::create(Func, Target));
+  }
   void _lea(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Lea::create(Func, Dest, Src0));
   }
diff --git a/src/assembler.cpp b/src/assembler.cpp
index b0ff419..d131426 100644
--- a/src/assembler.cpp
+++ b/src/assembler.cpp
@@ -40,7 +40,8 @@
   F->set_position(0);
   F->set_kind(Kind);
   F->set_value(Value);
-  fixups_.push_back(F);
+  if (!assembler_.getPreliminary())
+    fixups_.push_back(F);
   return F;
 }
 
diff --git a/src/assembler.h b/src/assembler.h
index dfd8cd1..84955e5 100644
--- a/src/assembler.h
+++ b/src/assembler.h
@@ -115,6 +115,11 @@
 
   const FixupRefList &fixups() const { return fixups_; }
 
+  void setSize(intptr_t NewSize) {
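+    // Only rolling the cursor backward is supported; this is used to
+    // discard a preliminary bundle_lock emission before padding and
+    // re-emitting it.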
+    assert(NewSize <= Size());
+    cursor_ = contents_ + NewSize;
+  }
+
 private:
   // The limit is set to kMinimumGap bytes before the end of the data area.
   // This leaves enough space for the longest possible instruction and allows
@@ -149,7 +154,9 @@
   Assembler &operator=(const Assembler &) = delete;
 
 public:
-  Assembler() : FunctionName(""), IsInternal(false), buffer_(*this) {}
+  Assembler()
+      : FunctionName(""), IsInternal(false), Preliminary(false),
+        buffer_(*this) {}
   virtual ~Assembler() {}
 
   // Allocate a chunk of bytes using the per-Assembler allocator.
@@ -170,6 +177,9 @@
   // Align the tail end of the function to the required target alignment.
   virtual void alignFunction() = 0;
 
+  // Add nop padding of a particular width to the current bundle.
+  virtual void padWithNop(intptr_t Padding) = 0;
+
   virtual SizeT getBundleAlignLog2Bytes() const = 0;
 
   virtual llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const = 0;
@@ -194,6 +204,11 @@
   void setInternal(bool Internal) { IsInternal = Internal; }
   const IceString &getFunctionName() { return FunctionName; }
   void setFunctionName(const IceString &NewName) { FunctionName = NewName; }
+  intptr_t getBufferSize() const { return buffer_.Size(); }
+  // Roll back to a (smaller) size.
+  void setBufferSize(intptr_t NewSize) { buffer_.setSize(NewSize); }
+  void setPreliminary(bool Value) { Preliminary = Value; }
+  bool getPreliminary() const { return Preliminary; }
 
 private:
   ArenaAllocator<32 * 1024> Allocator;
@@ -202,6 +217,11 @@
   // assembler buffer is emitted.
   IceString FunctionName;
   bool IsInternal;
+  // Preliminary indicates whether a preliminary pass is being made
+  // for calculating bundle padding (Preliminary=true), versus the
+  // final pass where all changes to label bindings, label links, and
+  // relocation fixups are fully committed (Preliminary=false).
+  bool Preliminary;
 
 protected:
   AssemblerBuffer buffer_;
diff --git a/src/assembler_ia32.cpp b/src/assembler_ia32.cpp
index d4fff33..aac473c 100644
--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -79,13 +79,15 @@
 }
 
 void AssemblerX86::BindCfgNodeLabel(SizeT NodeNumber) {
+  assert(!getPreliminary());
   Label *L = GetOrCreateCfgNodeLabel(NodeNumber);
   this->Bind(L);
 }
 
 void AssemblerX86::BindLocalLabel(SizeT Number) {
   Label *L = GetOrCreateLocalLabel(Number);
-  this->Bind(L);
+  if (!getPreliminary())
+    this->Bind(L);
 }
 
 void AssemblerX86::call(GPRRegister reg) {
@@ -2229,6 +2231,13 @@
     intptr_t offset = label->Position() - buffer_.Size();
     assert(offset <= 0);
     if (Utils::IsInt(8, offset - kShortSize)) {
+      // TODO(stichnot): Here and in jmp(), we may need to be more
+      // conservative about the backward branch distance if the branch
+      // instruction is within a bundle_lock sequence, because the
+      // distance may increase when padding is added.  This isn't an
+      // issue for branches outside a bundle_lock, because if padding
+      // is added, the retry may change it to a long backward branch
+      // without affecting any of the bookkeeping.
       EmitUint8(0x70 + condition);
       EmitUint8((offset - kShortSize) & 0xFF);
     } else {
@@ -2463,14 +2472,16 @@
   assert(!label->IsBound());
   intptr_t position = buffer_.Size();
   EmitInt32(label->position_);
-  label->LinkTo(position);
+  if (!getPreliminary())
+    label->LinkTo(position);
 }
 
 void AssemblerX86::EmitNearLabelLink(Label *label) {
   assert(!label->IsBound());
   intptr_t position = buffer_.Size();
   EmitUint8(0);
-  label->NearLinkTo(position);
+  if (!getPreliminary())
+    label->NearLinkTo(position);
 }
 
 void AssemblerX86::EmitGenericShift(int rm, Type Ty, GPRRegister reg,
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index 3e73574..06a601e 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -325,6 +325,8 @@
 
   intptr_t position_;
   intptr_t num_unresolved_;
+  // TODO(stichnot,jvoung): Can this instead be
+  // llvm::SmallVector<intptr_t, kMaxUnresolvedBranches> ?
   intptr_t unresolved_near_positions_[kMaxUnresolvedBranches];
 
   friend class AssemblerX86;
@@ -354,6 +356,15 @@
     return llvm::ArrayRef<uint8_t>(Padding, 1);
   }
 
+  void padWithNop(intptr_t Padding) override {
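+    // Emit full-width nops while more than MAX_NOP_SIZE bytes of padding
+    // remain, then a final shorter nop for any remainder.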
+    while (Padding > MAX_NOP_SIZE) {
+      nop(MAX_NOP_SIZE);
+      Padding -= MAX_NOP_SIZE;
+    }
+    if (Padding)
+      nop(Padding);
+  }
+
   Label *GetOrCreateCfgNodeLabel(SizeT NodeNumber);
   void BindCfgNodeLabel(SizeT NodeNumber) override;
   Label *GetOrCreateLocalLabel(SizeT Number);