Handle ARM "ret void" and function alignment with proper padding.

Modify run-pnacl-sz to pass in the correct assembler/disassembler flags
for ARM when not using the integrated assembler.

Model the "ret" pseudo-instruction (a special form of the
"bx" instruction). It is kept separate from "bx" so that epilogue
insertion can find the terminator.

Add a flag "--skip-unimplemented" to skip through all of the
"Not yet implemented" assertions, and use that in the test.

Set up a stack trace printer when ALLOW_DUMP is enabled so that
UnimplementedError prints out some useful information about
*which* case is unimplemented.

Change the ".type ...,@function" directive to use %function instead
of @function. The ARM assembler seems to only accept %function because
"@" is a comment character in ARM assembly.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1136793002
diff --git a/Makefile.standalone b/Makefile.standalone
index 18065af..df4429c 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -177,6 +177,7 @@
 	IceGlobalContext.cpp \
 	IceGlobalInits.cpp \
 	IceInst.cpp \
+	IceInstARM32.cpp \
 	IceInstX8632.cpp \
 	IceIntrinsics.cpp \
 	IceLiveness.cpp \
diff --git a/pydir/run-pnacl-sz.py b/pydir/run-pnacl-sz.py
index 2d86a35..e15b096 100755
--- a/pydir/run-pnacl-sz.py
+++ b/pydir/run-pnacl-sz.py
@@ -10,6 +10,22 @@
 
 from utils import shellcmd
 
+
+def TargetAssemblerFlags(target):
+  # TODO(stichnot): -triple=i686-nacl should be used for a
+  # sandboxing test.  This means there should be an args.sandbox
+  # argument that also gets passed through to pnacl-sz.
+  flags = { 'x8632': ['-triple=i686'],
+            'arm32': ['-triple=armv7a', '-mcpu=cortex-a9', '-mattr=+neon'] }
+  return flags[target]
+
+
+def TargetDisassemblerFlags(target):
+  flags = { 'x8632': ['-Mintel'],
+            'arm32': [] }
+  return flags[target]
+
+
 def main():
     """Run the pnacl-sz compiler on an llvm file.
 
@@ -56,6 +72,9 @@
     argparser.add_argument('--filetype', default='iasm', dest='filetype',
                            choices=['obj', 'asm', 'iasm'],
                            help='Output file type.  Default %(default)s.')
+    argparser.add_argument('--target', default='x8632', dest='target',
+                           choices=['x8632','arm32'],
+                           help='Target architecture.  Default %(default)s.')
     argparser.add_argument('--echo-cmd', required=False,
                            action='store_true',
                            help='Trace command that generates ICE instructions')
@@ -82,6 +101,7 @@
         cmd += ['--allow-local-symbol-tables']
       cmd += ['|']
     cmd += [args.pnacl_sz]
+    cmd += ['--target', args.target]
     if args.insts:
       # If the tests are based on '-verbose inst' output, force
       # single-threaded translation because dump output does not get
@@ -107,19 +127,17 @@
       asm_temp = tempfile.NamedTemporaryFile(delete=False)
       asm_temp.close()
     if args.assemble and args.filetype != 'obj':
-      cmd += ['|', os.path.join(pnacl_bin_path, 'llvm-mc'),
-              # TODO(stichnot): -triple=i686-nacl should be used for a
-              # sandboxing test.  This means there should be an args.sandbox
-              # argument that also gets passed through to pnacl-sz.
-              '-triple=i686',
-              '-filetype=obj', '-o', asm_temp.name]
+      cmd += (['|', os.path.join(pnacl_bin_path, 'llvm-mc')] +
+              TargetAssemblerFlags(args.target) +
+              ['-filetype=obj', '-o', asm_temp.name])
     elif asm_temp:
       cmd += ['-o', asm_temp.name]
     if args.disassemble:
       # Show wide instruction encodings, diassemble, and show relocs.
       cmd += (['&&', os.path.join(pnacl_bin_path, 'le32-nacl-objdump')] +
               args.dis_flags +
-              ['-w', '-d', '-r', '-Mintel', asm_temp.name])
+              ['-w', '-d', '-r'] + TargetDisassemblerFlags(args.target) +
+              [asm_temp.name])
 
     stdout_result = shellcmd(cmd, echo=args.echo_cmd)
     if not args.echo_cmd:
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index e0f48ab..5b5c32b 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -464,9 +464,10 @@
     Str << "\t.section\t.text." << MangledName << ",\"ax\",@progbits\n";
   if (!Asm->getInternal() || Ctx->getFlags().getDisableInternal()) {
     Str << "\t.globl\t" << MangledName << "\n";
-    Str << "\t.type\t" << MangledName << ",@function\n";
+    Str << "\t.type\t" << MangledName << ",%function\n";
   }
-  Str << "\t.p2align " << Asm->getBundleAlignLog2Bytes() << ",0x";
+  Str << "\t" << Asm->getNonExecPadDirective() << " "
+      << Asm->getBundleAlignLog2Bytes() << ",0x";
   for (uint8_t I : Asm->getNonExecBundlePadding())
     Str.write_hex(I);
   Str << "\n";
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index fda4e06..d8ce728 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -111,6 +111,11 @@
                                 cl::desc("Randomize register allocation"),
                                 cl::init(false));
 
+cl::opt<bool> SkipUnimplemented(
+    "skip-unimplemented",
+    cl::desc("Skip through unimplemented lowering code instead of aborting."),
+    cl::init(false));
+
 cl::opt<bool> SubzeroTimingEnabled(
     "timing", cl::desc("Enable breakdown timing of Subzero translation"));
 
@@ -260,6 +265,7 @@
   OutFlags.PhiEdgeSplit = false;
   OutFlags.RandomNopInsertion = false;
   OutFlags.RandomRegAlloc = false;
+  OutFlags.SkipUnimplemented = false;
   OutFlags.SubzeroTimingEnabled = false;
   OutFlags.TimeEachFunction = false;
   OutFlags.UseSandboxing = false;
@@ -311,6 +317,7 @@
   OutFlags.setRandomSeed(::RandomSeed);
   OutFlags.setShouldDoNopInsertion(::ShouldDoNopInsertion);
   OutFlags.setShouldRandomizeRegAlloc(::RandomizeRegisterAllocation);
+  OutFlags.setSkipUnimplemented(::SkipUnimplemented);
   OutFlags.setSubzeroTimingEnabled(::SubzeroTimingEnabled);
   OutFlags.setTargetArch(::TargetArch);
   OutFlags.setTargetInstructionSet(::TargetInstructionSet);
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index c89e695..517c77f 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -86,6 +86,9 @@
   bool shouldRandomizeRegAlloc() const { return RandomRegAlloc; }
   void setShouldRandomizeRegAlloc(bool NewValue) { RandomRegAlloc = NewValue; }
 
+  bool getSkipUnimplemented() const { return SkipUnimplemented; }
+  void setSkipUnimplemented(bool NewValue) { SkipUnimplemented = NewValue; }
+
   bool getSubzeroTimingEnabled() const { return SubzeroTimingEnabled; }
   void setSubzeroTimingEnabled(bool NewValue) {
     SubzeroTimingEnabled = NewValue;
@@ -184,6 +187,7 @@
   bool PhiEdgeSplit;
   bool RandomNopInsertion;
   bool RandomRegAlloc;
+  bool SkipUnimplemented;
   bool SubzeroTimingEnabled;
   bool TimeEachFunction;
   bool UseSandboxing;
diff --git a/src/IceCompileServer.cpp b/src/IceCompileServer.cpp
index 25b8092..d4048e9 100644
--- a/src/IceCompileServer.cpp
+++ b/src/IceCompileServer.cpp
@@ -17,6 +17,7 @@
 
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/StreamingMemoryObject.h"
 
@@ -49,6 +50,9 @@
 } // end of anonymous namespace
 
 void CLCompileServer::run() {
+  if (ALLOW_DUMP) {
+    llvm::sys::PrintStackTraceOnErrorSignal();
+  }
   ClFlags::parseFlags(argc, argv);
   ClFlags Flags;
   ClFlagsExtra ExtraFlags;
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
new file mode 100644
index 0000000..9f54e85
--- /dev/null
+++ b/src/IceInstARM32.cpp
@@ -0,0 +1,101 @@
+//===- subzero/src/IceInstARM32.cpp - ARM32 instruction implementation ----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the InstARM32 and OperandARM32 classes,
+// primarily the constructors and the dump()/emit() methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "assembler_arm32.h"
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceInst.h"
+#include "IceInstARM32.h"
+#include "IceOperand.h"
+#include "IceRegistersARM32.h"
+#include "IceTargetLoweringARM32.h"
+
+namespace Ice {
+
+namespace {
+
+const struct TypeARM32Attributes_ {
+  const char *WidthString; // b, h, <blank>, or d
+  int8_t SExtAddrOffsetBits;
+  int8_t ZExtAddrOffsetBits;
+} TypeARM32Attributes[] = {
+#define X(tag, elementty, width, sbits, ubits)                                 \
+  { width, sbits, ubits }                                                      \
+  ,
+    ICETYPEARM32_TABLE
+#undef X
+};
+
+} // end of anonymous namespace
+
+const char *InstARM32::getWidthString(Type Ty) {
+  return TypeARM32Attributes[Ty].WidthString;
+}
+
+bool OperandARM32Mem::canHoldOffset(Type Ty, bool SignExt, int32_t Offset) {
+  int32_t Bits = SignExt ? TypeARM32Attributes[Ty].SExtAddrOffsetBits
+                         : TypeARM32Attributes[Ty].ZExtAddrOffsetBits;
+  if (Bits == 0)
+    return Offset == 0;
+  // Note that encodings for offsets are sign-magnitude for ARM, so we check
+  // with IsAbsoluteUint().
+  if (isScalarFloatingType(Ty))
+    return Utils::IsAligned(Offset, 4) && Utils::IsAbsoluteUint(Bits, Offset);
+  return Utils::IsAbsoluteUint(Bits, Offset);
+}
+
+InstARM32Ret::InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source)
+    : InstARM32(Func, InstARM32::Ret, Source ? 2 : 1, nullptr) {
+  addSource(LR);
+  if (Source)
+    addSource(Source);
+}
+
+// ======================== Dump routines ======================== //
+
+void InstARM32::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "[ARM32] ";
+  Inst::dump(Func);
+}
+
+void InstARM32Ret::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  assert(getSrcSize() > 0);
+  Variable *LR = llvm::cast<Variable>(getSrc(0));
+  assert(LR->hasReg());
+  assert(LR->getRegNum() == RegARM32::Reg_lr);
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\tbx\t";
+  LR->emit(Func);
+}
+
+void InstARM32Ret::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Ret::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = (getSrcSize() == 1 ? IceType_void : getSrc(0)->getType());
+  Str << "ret." << Ty << " ";
+  dumpSources(Func);
+}
+
+} // end of namespace Ice
diff --git a/src/IceInstARM32.def b/src/IceInstARM32.def
index ea7032a..8eca14b 100644
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -61,5 +61,24 @@
 
 // TODO(jvoung): add condition code tables, etc.
 
+// Load/Store instruction width suffixes.
+#define ICETYPEARM32_TABLE                                              \
+  /* tag,          element type, width, addr off bits sext, zext */     \
+  X(IceType_void,  IceType_void, "",  0, 0)                             \
+  X(IceType_i1,    IceType_void, "b", 8, 12)                            \
+  X(IceType_i8,    IceType_void, "b", 8, 12)                            \
+  X(IceType_i16,   IceType_void, "h", 8, 8)                             \
+  X(IceType_i32,   IceType_void, "", 12, 12)                            \
+  X(IceType_i64,   IceType_void, "d", 8, 8)                             \
+  X(IceType_f32,   IceType_void, "", 10, 10)                            \
+  X(IceType_f64,   IceType_void, "", 10, 10)                            \
+  X(IceType_v4i1,  IceType_i32 , "",  0,  0)                            \
+  X(IceType_v8i1,  IceType_i16 , "",  0,  0)                            \
+  X(IceType_v16i1, IceType_i8  , "",  0,  0)                            \
+  X(IceType_v16i8, IceType_i8  , "",  0,  0)                            \
+  X(IceType_v8i16, IceType_i16 , "",  0,  0)                            \
+  X(IceType_v4i32, IceType_i32 , "",  0,  0)                            \
+  X(IceType_v4f32, IceType_f32 , "",  0,  0)                            \
+//#define X(tag, elementty, width, sbits, ubits)
 
 #endif // SUBZERO_SRC_ICEINSTARM32_DEF
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 1c7d346..e4e6c49 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -17,11 +17,104 @@
 #define SUBZERO_SRC_ICEINSTARM32_H
 
 #include "IceDefs.h"
+#include "IceInst.h"
+#include "IceInstARM32.def"
+#include "IceOperand.h"
 
 namespace Ice {
 
 class TargetARM32;
-// Fill this in.
+
+// OperandARM32 extends the Operand hierarchy.
+// TODO(jvoung): Add the OperandARM32Mem and OperandARM32Flex.
+class OperandARM32 : public Operand {
+  OperandARM32() = delete;
+  OperandARM32(const OperandARM32 &) = delete;
+  OperandARM32 &operator=(const OperandARM32 &) = delete;
+
+public:
+  enum OperandKindARM32 { k__Start = Operand::kTarget };
+
+  enum ShiftKind {
+    kNoShift = -1,
+#define X(enum, emit) enum,
+    ICEINSTARM32SHIFT_TABLE
+#undef X
+  };
+
+  using Operand::dump;
+  void dump(const Cfg *, Ostream &Str) const override {
+    if (ALLOW_DUMP)
+      Str << "<OperandARM32>";
+  }
+
+protected:
+  OperandARM32(OperandKindARM32 Kind, Type Ty)
+      : Operand(static_cast<OperandKind>(Kind), Ty) {}
+  ~OperandARM32() override {}
+};
+
+// OperandARM32Mem represents a memory operand in any of the various ARM32
+// addressing modes.
+// TODO(jvoung): Fill out more.
+class OperandARM32Mem : public OperandARM32 {
+  OperandARM32Mem() = delete;
+  OperandARM32Mem(const OperandARM32Mem &) = delete;
+  OperandARM32Mem &operator=(const OperandARM32Mem &) = delete;
+
+public:
+  // Return true if a load/store instruction for an element of type Ty
+  // can encode the Offset directly in the immediate field of the 32-bit
+  // ARM instruction. For some types, if the load is Sign extending, then
+  // the range is reduced.
+  static bool canHoldOffset(Type Ty, bool SignExt, int32_t Offset);
+};
+
+class InstARM32 : public InstTarget {
+  InstARM32() = delete;
+  InstARM32(const InstARM32 &) = delete;
+  InstARM32 &operator=(const InstARM32 &) = delete;
+
+public:
+  enum InstKindARM32 { k__Start = Inst::Target, Ret };
+
+  static const char *getWidthString(Type Ty);
+
+  void dump(const Cfg *Func) const override;
+
+protected:
+  InstARM32(Cfg *Func, InstKindARM32 Kind, SizeT Maxsrcs, Variable *Dest)
+      : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
+  ~InstARM32() override {}
+  static bool isClassof(const Inst *Inst, InstKindARM32 MyKind) {
+    return Inst->getKind() == static_cast<InstKind>(MyKind);
+  }
+};
+
+// Ret pseudo-instruction.  This is actually a "bx" instruction with
+// an "lr" register operand, but epilogue lowering will search for a Ret
+// instead of a generic "bx". This instruction also takes a Source
+// operand (for non-void returning functions) for liveness analysis, though
+// a FakeUse before the ret would do just as well.
+class InstARM32Ret : public InstARM32 {
+  InstARM32Ret() = delete;
+  InstARM32Ret(const InstARM32Ret &) = delete;
+  InstARM32Ret &operator=(const InstARM32Ret &) = delete;
+
+public:
+  static InstARM32Ret *create(Cfg *Func, Variable *LR,
+                              Variable *Source = nullptr) {
+    return new (Func->allocate<InstARM32Ret>()) InstARM32Ret(Func, LR, Source);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Ret); }
+
+private:
+  InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source);
+  ~InstARM32Ret() override {}
+};
 
 } // end of namespace Ice
 
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 6691e1e..287e42d 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -30,6 +30,17 @@
 
 namespace Ice {
 
+namespace {
+void UnimplementedError(const ClFlags &Flags) {
+  if (!Flags.getSkipUnimplemented()) {
+    // Use llvm_unreachable instead of report_fatal_error, which gives better
+    // stack traces.
+    llvm_unreachable("Not yet implemented");
+    abort();
+  }
+}
+} // end of anonymous namespace
+
 TargetARM32::TargetARM32(Cfg *Func)
     : TargetLowering(Func), UsesFramePointer(false) {
   // TODO: Don't initialize IntegerRegisters and friends every time.
@@ -205,7 +216,8 @@
 bool TargetARM32::doBranchOpt(Inst *I, const CfgNode *NextNode) {
   (void)I;
   (void)NextNode;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
+  return false;
 }
 
 IceString TargetARM32::RegNames[] = {
@@ -233,9 +245,9 @@
     Reg = Func->makeVariable(Ty);
     Reg->setRegNum(RegNum);
     PhysicalRegisters[Ty][RegNum] = Reg;
-    // Specially mark SP as an "argument" so that it is considered
+    // Specially mark SP and LR as an "argument" so that it is considered
     // live upon function entry.
-    if (RegNum == RegARM32::Reg_sp) {
+    if (RegNum == RegARM32::Reg_sp || RegNum == RegARM32::Reg_lr) {
       Func->addImplicitArg(Reg);
       Reg->setIgnoreLiveness();
     }
@@ -245,25 +257,42 @@
 
 void TargetARM32::emitVariable(const Variable *Var) const {
   Ostream &Str = Ctx->getStrEmit();
-  (void)Var;
-  (void)Str;
-  llvm::report_fatal_error("emitVariable: Not yet implemented");
+  if (Var->hasReg()) {
+    Str << getRegName(Var->getRegNum(), Var->getType());
+    return;
+  }
+  if (Var->getWeight().isInf()) {
+    llvm::report_fatal_error(
+        "Infinite-weight Variable has no register assigned");
+  }
+  int32_t Offset = Var->getStackOffset();
+  if (!hasFramePointer())
+    Offset += getStackAdjustment();
+  // TODO(jvoung): Handle out of range. Perhaps we need a scratch register
+  // to materialize a larger offset.
+  const bool SignExt = false;
+  if (!OperandARM32Mem::canHoldOffset(Var->getType(), SignExt, Offset)) {
+    llvm::report_fatal_error("Illegal stack offset");
+  }
+  const Type FrameSPTy = IceType_i32;
+  Str << "[" << getRegName(getFrameOrStackReg(), FrameSPTy) << ", " << Offset
+      << "]";
 }
 
 void TargetARM32::lowerArguments() {
-  llvm::report_fatal_error("lowerArguments: Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 Type TargetARM32::stackSlotType() { return IceType_i32; }
 
 void TargetARM32::addProlog(CfgNode *Node) {
   (void)Node;
-  llvm::report_fatal_error("addProlog: Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::addEpilog(CfgNode *Node) {
   (void)Node;
-  llvm::report_fatal_error("addEpilog: Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 llvm::SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
@@ -305,7 +334,7 @@
   // restriction can be relaxed in some cases.
   NeedsStackAlignment = true;
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
@@ -314,75 +343,75 @@
     llvm_unreachable("Unknown arithmetic operator");
     break;
   case InstArithmetic::Add:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::And:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Or:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Xor:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Sub:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Mul:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Shl:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Lshr:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Ashr:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Udiv:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Sdiv:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Urem:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Srem:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Fadd:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Fsub:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Fmul:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Fdiv:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstArithmetic::Frem:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   }
 }
 
 void TargetARM32::lowerAssign(const InstAssign *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerBr(const InstBr *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerCall(const InstCall *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerCast(const InstCast *Inst) {
@@ -392,39 +421,39 @@
     Func->setError("Cast type not supported");
     return;
   case InstCast::Sext: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   }
   case InstCast::Zext: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   }
   case InstCast::Trunc: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   }
   case InstCast::Fptrunc:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstCast::Fpext: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   }
   case InstCast::Fptosi:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstCast::Fptoui:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstCast::Sitofp:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   case InstCast::Uitofp: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   }
   case InstCast::Bitcast: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     break;
   }
   }
@@ -432,72 +461,72 @@
 
 void TargetARM32::lowerExtractElement(const InstExtractElement *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerFcmp(const InstFcmp *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerInsertElement(const InstInsertElement *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
   case Intrinsics::AtomicCmpxchg: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::AtomicFence:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   case Intrinsics::AtomicFenceAll:
     // NOTE: FenceAll should prevent and load/store from being moved
     // across the fence (both atomic and non-atomic). The InstARM32Mfence
     // instruction is currently marked coarsely as "HasSideEffects".
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   case Intrinsics::AtomicIsLockFree: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::AtomicLoad: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::AtomicRMW:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   case Intrinsics::AtomicStore: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Bswap: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Ctpop: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Ctlz: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Cttz: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Fabs: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Longjmp: {
@@ -542,7 +571,7 @@
   }
   case Intrinsics::NaClReadTP: {
     if (Ctx->getFlags().getUseSandboxing()) {
-      llvm::report_fatal_error("Not yet implemented");
+      UnimplementedError(Func->getContext()->getFlags());
     } else {
       InstCall *Call = makeHelperCall(H_call_read_tp, Instr->getDest(), 0);
       lowerCall(Call);
@@ -556,19 +585,19 @@
     return;
   }
   case Intrinsics::Sqrt: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Stacksave: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Stackrestore: {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   }
   case Intrinsics::Trap:
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
     return;
   case Intrinsics::UnknownIntrinsic:
     Func->setError("Should not be lowering UnknownIntrinsic");
@@ -579,17 +608,17 @@
 
 void TargetARM32::lowerLoad(const InstLoad *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::doAddressOptLoad() {
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::randomlyInsertNop(float Probability) {
   RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
   if (RNG.getTrueWithProbability(Probability)) {
-    llvm::report_fatal_error("Not yet implemented");
+    UnimplementedError(Func->getContext()->getFlags());
   }
 }
 
@@ -598,27 +627,42 @@
 }
 
 void TargetARM32::lowerRet(const InstRet *Inst) {
-  (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  Variable *Reg = nullptr;
+  if (Inst->hasRetValue()) {
+    UnimplementedError(Func->getContext()->getFlags());
+  }
+  // Add a ret instruction even if sandboxing is enabled, because
+  // addEpilog explicitly looks for a ret instruction as a marker for
+  // where to insert the frame removal instructions.
+  // addEpilog is responsible for restoring the "lr" register as needed
+  // prior to this ret instruction.
+  _ret(getPhysicalRegister(RegARM32::Reg_lr), Reg);
+  // Add a fake use of sp to make sure sp stays alive for the entire
+  // function.  Otherwise post-call sp adjustments get dead-code
+  // eliminated.  TODO: Are there more places where the fake use
+  // should be inserted?  E.g. "void f(int n){while(1) g(n);}" may not
+  // have a ret instruction.
+  Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp);
+  Context.insert(InstFakeUse::create(Func, SP));
 }
 
 void TargetARM32::lowerSelect(const InstSelect *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerStore(const InstStore *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::doAddressOptStore() {
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerSwitch(const InstSwitch *Inst) {
   (void)Inst;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::lowerUnreachable(const InstUnreachable * /*Inst*/) {
@@ -630,7 +674,7 @@
 // turned into zeroes, since loOperand() and hiOperand() don't expect
 // Undef input.
 void TargetARM32::prelowerPhis() {
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 // Lower the pre-ordered list of assignments into mov instructions.
@@ -639,7 +683,7 @@
                                       const AssignList &Assignments) {
   (void)Node;
   (void)Assignments;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::postLower() {
@@ -647,7 +691,7 @@
     return;
   // Find two-address non-SSA instructions where Dest==Src0, and set
   // the DestNonKillable flag to keep liveness analysis consistent.
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 void TargetARM32::makeRandomRegisterPermutation(
@@ -655,7 +699,7 @@
     const llvm::SmallBitVector &ExcludeRegisters) const {
   (void)Permutation;
   (void)ExcludeRegisters;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 /* TODO(jvoung): avoid duplicate symbols with multiple targets.
@@ -673,7 +717,7 @@
 
 void TargetDataARM32::lowerGlobal(const VariableDeclaration &Var) const {
   (void)Var;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Ctx->getFlags());
 }
 
 void TargetDataARM32::lowerGlobals(
@@ -699,7 +743,7 @@
 void TargetDataARM32::lowerConstants() const {
   if (Ctx->getFlags().getDisableTranslation())
     return;
-  llvm::report_fatal_error("Not yet implemented");
+  UnimplementedError(Ctx->getFlags());
 }
 
 } // end of namespace Ice
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index e973652..c1862d5 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -16,6 +16,7 @@
 #define SUBZERO_SRC_ICETARGETLOWERINGARM32_H
 
 #include "IceDefs.h"
+#include "IceInstARM32.h"
 #include "IceRegistersARM32.h"
 #include "IceTargetLowering.h"
 
@@ -91,6 +92,14 @@
 
   static Type stackSlotType();
 
+  // The following are helpers that insert lowered ARM32 instructions
+  // with minimal syntactic overhead, so that the lowering code can
+  // look as close to assembly as practical.
+
+  void _ret(Variable *LR, Variable *Src0 = nullptr) {
+    Context.insert(InstARM32Ret::create(Func, LR, Src0));
+  }
+
   bool UsesFramePointer;
   bool NeedsStackAlignment;
   llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 6178d45..dd75168 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -509,13 +509,13 @@
   }
   if (Var->getWeight().isInf())
     llvm_unreachable("Infinite-weight Variable has no register assigned");
-  const Type Ty = IceType_i32;
   int32_t Offset = Var->getStackOffset();
   if (!hasFramePointer())
     Offset += getStackAdjustment();
   if (Offset)
     Str << Offset;
-  Str << "(%" << getRegName(getFrameOrStackReg(), Ty) << ")";
+  const Type FrameSPTy = IceType_i32;
+  Str << "(%" << getRegName(getFrameOrStackReg(), FrameSPTy) << ")";
 }
 
 X8632::Address TargetX8632::stackVarToAsmOperand(const Variable *Var) const {
diff --git a/src/IceUtils.h b/src/IceUtils.h
index 1a7a8df..bcbba23 100644
--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -51,11 +51,34 @@
     return (0 <= value) && (value < limit);
   }
 
+  // Check whether the magnitude of Value fits in N bits, i.e., whether an
+  // (N+1)-bit sign-magnitude representation can hold Value.
+  //
+  // The most negative value of a signed T is rejected up front: its magnitude
+  // needs all CHAR_BIT * sizeof(T) bits, which the assertion on N rules out,
+  // and negating it would be signed overflow (undefined behavior).
+  template <typename T> static inline bool IsAbsoluteUint(int N, T Value) {
+    assert((0 < N) &&
+           (static_cast<unsigned int>(N) < (CHAR_BIT * sizeof(Value))));
+    if (Value < 0) {
+      if (Value == std::numeric_limits<T>::min())
+        return false;
+      Value = -Value;
+    }
+    return IsUint(N, Value);
+  }
+
   template <typename T> static inline bool WouldOverflowAdd(T X, T Y) {
     return ((X > 0 && Y > 0 && (X > std::numeric_limits<T>::max() - Y)) ||
             (X < 0 && Y < 0 && (X < std::numeric_limits<T>::min() - Y)));
   }
 
+  // Return true if X is a multiple of N; N must be a power of two.
+  template <typename T> static inline bool IsAligned(T X, intptr_t N) {
+    assert(llvm::isPowerOf2_64(N));
+    return (X & (N - 1)) == 0;
+  }
+
   static inline uint64_t OffsetToAlignment(uint64_t Pos, uint64_t Align) {
     assert(llvm::isPowerOf2_64(Align));
     uint64_t Mod = Pos & (Align - 1);
diff --git a/src/assembler.h b/src/assembler.h
index 84955e5..cf65d94 100644
--- a/src/assembler.h
+++ b/src/assembler.h
@@ -182,6 +182,7 @@
 
   virtual SizeT getBundleAlignLog2Bytes() const = 0;
 
+  virtual const char *getNonExecPadDirective() const = 0;
   virtual llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const = 0;
 
   // Mark the current text location as the start of a CFG node
diff --git a/src/assembler_arm32.h b/src/assembler_arm32.h
index 6c8a958..54eadec 100644
--- a/src/assembler_arm32.h
+++ b/src/assembler_arm32.h
@@ -42,29 +42,32 @@
   }
   ~AssemblerARM32() override = default;
 
-  void alignFunction() override {
-    llvm::report_fatal_error("Not yet implemented.");
-  }
+  void alignFunction() override { llvm_unreachable("Not yet implemented."); }
 
   SizeT getBundleAlignLog2Bytes() const override { return 4; }
 
+  const char *getNonExecPadDirective() const override { return ".p2alignl"; }
+
   llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const override {
-    llvm::report_fatal_error("Not yet implemented.");
+    // Use a particular UDF encoding -- TRAPNaCl in LLVM: 0xE7FEDEF0
+    // http://llvm.org/viewvc/llvm-project?view=revision&revision=173943
+    static const uint8_t Padding[] = {0xE7, 0xFE, 0xDE, 0xF0};
+    return llvm::ArrayRef<uint8_t>(Padding, 4);
   }
 
   void padWithNop(intptr_t Padding) override {
     (void)Padding;
-    llvm::report_fatal_error("Not yet implemented.");
+    llvm_unreachable("Not yet implemented.");
   }
 
   void BindCfgNodeLabel(SizeT NodeNumber) override {
     (void)NodeNumber;
-    llvm::report_fatal_error("Not yet implemented.");
+    llvm_unreachable("Not yet implemented.");
   }
 
   bool fixupIsPCRel(FixupKind Kind) const override {
     (void)Kind;
-    llvm::report_fatal_error("Not yet implemented.");
+    llvm_unreachable("Not yet implemented.");
   }
 };
 
diff --git a/src/assembler_ia32.h b/src/assembler_ia32.h
index 4cb6ee7..f567516 100644
--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -352,6 +352,8 @@
 
   SizeT getBundleAlignLog2Bytes() const override { return 5; }
 
+  const char *getNonExecPadDirective() const override { return ".p2align"; }
+
   llvm::ArrayRef<uint8_t> getNonExecBundlePadding() const override {
     static const uint8_t Padding[] = {0xF4};
     return llvm::ArrayRef<uint8_t>(Padding, 1);
diff --git a/tests_lit/llvm2ice_tests/function_aligned.ll b/tests_lit/llvm2ice_tests/function_aligned.ll
index 02b460b..2b3da9a 100644
--- a/tests_lit/llvm2ice_tests/function_aligned.ll
+++ b/tests_lit/llvm2ice_tests/function_aligned.ll
@@ -4,6 +4,12 @@
 ; Also, we are currently using hlts for non-executable padding.
 
 ; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s
 
 define void @foo() {
   ret void
@@ -11,9 +17,16 @@
 ; CHECK-LABEL: foo
 ; CHECK-NEXT: 0: {{.*}} ret
 ; CHECK-NEXT: 1: {{.*}} hlt
+; ARM32-LABEL: foo
+; ARM32-NEXT: 0: {{.*}} bx lr
+; ARM32-NEXT: 4: e7fedef0 udf
+; ARM32-NEXT: 8: e7fedef0 udf
+; ARM32-NEXT: c: e7fedef0 udf
 
 define void @bar() {
   ret void
 }
 ; CHECK-LABEL: bar
 ; CHECK-NEXT: 20: {{.*}} ret
+; ARM32-LABEL: bar
+; ARM32-NEXT: 10: {{.*}} bx lr