Instrumented local variables and implemented runtime.

BUG=https://bugs.chromium.org/p/nativeclient/issues/detail?id=4374
R=kschimpf@google.com

Review URL: https://codereview.chromium.org/2095763002 .
diff --git a/runtime/szrt_asan.c b/runtime/szrt_asan.c
index 9f62e28..6fab295 100644
--- a/runtime/szrt_asan.c
+++ b/runtime/szrt_asan.c
@@ -16,48 +16,133 @@
 //===----------------------------------------------------------------------===//
 
 #include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/mman.h>
 
-static __thread int behind_malloc = 0;
+#define RZ_SIZE (32)
+#define SHADOW_SCALE_LOG2 (3)
+#define SHADOW_SCALE ((size_t)1 << SHADOW_SCALE_LOG2)
 
-// TODO(tlively): Define and implement this library
+// Assuming a 48 bit address space on 64 bit systems (1u << 45 would be UB)
+#define SHADOW_LENGTH_64 (1ull << (48 - SHADOW_SCALE_LOG2))
+#define SHADOW_LENGTH_32 (1u << (32 - SHADOW_SCALE_LOG2))
+#define IS_32_BIT (sizeof(void *) == 4)
+
+#define SHADOW_OFFSET(p) ((uintptr_t)(p) % SHADOW_SCALE)
+#define IS_SHADOW_ALIGNED(p) (SHADOW_OFFSET(p) == 0)
+
+#define MEM2SHADOW(p) (((uintptr_t)(p) >> SHADOW_SCALE_LOG2) + shadow_offset)
+#define SHADOW2MEM(p)                                                          \
+  ((uintptr_t)((char *)(p)-shadow_offset) << SHADOW_SCALE_LOG2)
+
+#define POISON_VAL (-1)
+
+static char *shadow_offset = NULL;
+
+void __asan_init(void);
+void __asan_check(char *, int);
+void *__asan_malloc(size_t);
+void __asan_free(char *);
+void __asan_poison(char *, int);
+void __asan_unpoison(char *, int);
+
 void __asan_init(void) {
-  if (behind_malloc == 0)
-    printf("set up shadow memory here\n");
+  // ensure the redzones are large enough to hold metadata
+  assert(RZ_SIZE >= sizeof(void *) && RZ_SIZE >= sizeof(size_t));
+  assert(shadow_offset == NULL);
+  size_t length = (IS_32_BIT) ? SHADOW_LENGTH_32 : SHADOW_LENGTH_64;
+  int prot = PROT_READ | PROT_WRITE;
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  shadow_offset = mmap((void *)length, length, prot, flags, -1, 0);
+  // mmap() signals failure with MAP_FAILED, never NULL; bail out on failure
+  if (shadow_offset == MAP_FAILED) {
+    fprintf(stderr, "unable to allocate shadow memory\n");
+    abort();
+  }
+  printf("set up shadow memory at %p\n", shadow_offset);
+  if (mprotect(MEM2SHADOW(shadow_offset), length >> SHADOW_SCALE_LOG2,
+               PROT_NONE))
+    fprintf(stderr, "could not protect bad region\n");
+  else
+    printf("protected bad region\n");
 }
 
-void __asan_check(void *addr, int size) {
-  if (behind_malloc == 0)
-    printf("check %d bytes at %p\n", size, addr);
+void __asan_check(char *ptr, int size) {
+  printf("check %d bytes at %p\n", size, ptr);
+  for (char *end = ptr + size; ptr < end; ++ptr) {
+    char shadow = *(char *)MEM2SHADOW(ptr);
+    printf("checking %p with shadow %d\n", ptr, shadow);
+    // shadow k > 0 means only offsets 0..k-1 of the granule are addressable
+    assert(shadow == 0 || (shadow > 0 && SHADOW_OFFSET(ptr) < shadow));
+  }
 }
 
 void *__asan_malloc(size_t size) {
-  if (behind_malloc == 0)
-    printf("malloc() called with size %d\n", size);
-  ++behind_malloc;
-  void *ret = malloc(size);
-  --behind_malloc;
-  assert(behind_malloc >= 0);
+  printf("malloc() called with size %zu\n", size); // %zu: size is a size_t
+  // pad the right redzone so that its end lands on a shadow granule boundary
+  size_t padding =
+      (IS_SHADOW_ALIGNED(size)) ? 0 : SHADOW_SCALE - SHADOW_OFFSET(size);
+  size_t rz_right_size = RZ_SIZE + padding;
+  void *rz_left;
+  int err = posix_memalign(&rz_left, SHADOW_SCALE,
+                           RZ_SIZE + size + rz_right_size);
+  if (err != 0) {
+    assert(err == ENOMEM);
+    return NULL;
+  }
+  char *ret = (char *)rz_left + RZ_SIZE;
+  char *rz_right = ret + size;
+  __asan_poison(rz_left, RZ_SIZE);
+  __asan_poison(rz_right, rz_right_size);
+  // record size and location data so we can find it again
+  *(void **)rz_left = rz_right;
+  *(size_t *)rz_right = rz_right_size;
+  assert((uintptr_t)ret % 8 == 0);
   return ret;
 }
 
-void __asan_free(void *ptr) {
-  if (behind_malloc == 0)
-    printf("free() called on %p\n", ptr);
-  ++behind_malloc;
-  free(ptr);
-  --behind_malloc;
-  assert(behind_malloc >= 0);
+void __asan_free(char *ptr) {
+  printf("free() called on %p\n", ptr);
+  if (ptr == NULL) // like free(), releasing a null pointer must be a no-op
+    return;
+  char *rz_right = *(char **)(ptr - RZ_SIZE); // stored by __asan_malloc
+  __asan_unpoison(ptr - RZ_SIZE, RZ_SIZE);
+  __asan_unpoison(rz_right, *(size_t *)rz_right); // size kept in the redzone
+  free(ptr - RZ_SIZE);
 }
 
-void __asan_alloca(void *ptr, int size) {
-  if (behind_malloc == 0)
-    printf("alloca of %d bytes at %p\n", size, ptr);
+void __asan_poison(char *ptr, int size) {
+  char *end = ptr + size;
+  assert(IS_SHADOW_ALIGNED(end));
+  // redzones should be no greater than RZ_SIZE + RZ_SIZE-1 for alignment
+  assert(size < 2 * RZ_SIZE);
+  printf("poison %d bytes at %p: %p - %p\n", size, ptr, MEM2SHADOW(ptr),
+         MEM2SHADOW(end));
+  size_t offset = SHADOW_OFFSET(ptr);
+  *(char *)MEM2SHADOW(ptr) = (offset == 0) ? POISON_VAL : offset;
+  ptr += SHADOW_OFFSET(size);
+  assert(IS_SHADOW_ALIGNED(ptr));
+  for (; ptr != end; ptr += SHADOW_SCALE) {
+    *(char *)MEM2SHADOW(ptr) = POISON_VAL;
+  }
 }
 
-void __asan_unalloca(void *ptr, int size) {
-  if (behind_malloc == 0)
-    printf("unalloca of %d bytes as %p\n", size, ptr);
+void __asan_unpoison(char *ptr, int size) {
+  char *end = ptr + size;
+  assert(IS_SHADOW_ALIGNED(end));
+  assert(size < 2 * RZ_SIZE);
+  printf("unpoison %d bytes at %p: %p - %p\n", size, ptr, MEM2SHADOW(ptr),
+         MEM2SHADOW(end));
+  *(char *)MEM2SHADOW(ptr) = 0;
+  ptr += SHADOW_OFFSET(size);
+  assert(IS_SHADOW_ALIGNED(ptr));
+  for (; ptr != end; ptr += SHADOW_SCALE) {
+    *(char *)MEM2SHADOW(ptr) = 0;
+  }
 }
diff --git a/src/IceASanInstrumentation.cpp b/src/IceASanInstrumentation.cpp
index f4b47e1..83ebc19 100644
--- a/src/IceASanInstrumentation.cpp
+++ b/src/IceASanInstrumentation.cpp
@@ -24,10 +24,12 @@
 
 #include <sstream>
 #include <unordered_map>
+#include <vector>
 
 namespace Ice {
 
 namespace {
+
 constexpr SizeT RzSize = 32;
 const std::string RzPrefix = "__$rz";
 const llvm::NaClBitcodeRecord::RecordVector RzContents =
@@ -42,6 +44,9 @@
 
 } // end of anonymous namespace
 
+ICE_TLS_DEFINE_FIELD(std::vector<InstCall *> *, ASanInstrumentation,
+                     LocalDtors);
+
 // Create redzones around all global variables, ensuring that the initializer
 // types of the redzones and their associated globals match so that they are
 // laid out together in memory.
@@ -126,38 +131,95 @@
 // Check for an alloca signaling the presence of local variables and add a
 // redzone if it is found
 void ASanInstrumentation::instrumentFuncStart(LoweringContext &Context) {
-  auto *FirstAlloca = llvm::dyn_cast<InstAlloca>(Context.getCur());
-  if (FirstAlloca == nullptr)
-    return;
+  if (ICE_TLS_GET_FIELD(LocalDtors) == nullptr)
+    ICE_TLS_SET_FIELD(LocalDtors, new std::vector<InstCall *>());
 
-  constexpr SizeT Alignment = 4;
-  InstAlloca *RzAlloca = createLocalRz(Context, RzSize, Alignment);
-
-  // insert before the current instruction
-  InstList::iterator Next = Context.getNext();
-  Context.setInsertPoint(Context.getCur());
-  Context.insert(RzAlloca);
-  Context.setNext(Next);
-}
-
-void ASanInstrumentation::instrumentAlloca(LoweringContext &Context,
-                                           InstAlloca *Instr) {
-  auto *VarSizeOp = llvm::dyn_cast<ConstantInteger32>(Instr->getSizeInBytes());
-  SizeT VarSize = (VarSizeOp == nullptr) ? RzSize : VarSizeOp->getValue();
-  SizeT Padding = Utils::OffsetToAlignment(VarSize, RzSize);
-  constexpr SizeT Alignment = 1;
-  InstAlloca *Rz = createLocalRz(Context, RzSize + Padding, Alignment);
-  Context.insert(Rz);
-}
-
-InstAlloca *ASanInstrumentation::createLocalRz(LoweringContext &Context,
-                                               SizeT Size, SizeT Alignment) {
   Cfg *Func = Context.getNode()->getCfg();
-  Variable *Rz = Func->makeVariable(IceType_i32);
-  Rz->setName(Func, nextRzName());
-  auto *ByteCount = ConstantInteger32::create(Ctx, IceType_i32, Size);
-  auto *RzAlloca = InstAlloca::create(Func, Rz, ByteCount, Alignment);
-  return RzAlloca;
+  bool HasLocals = false;
+  LoweringContext C;
+  C.init(Context.getNode());
+  std::vector<Inst *> Initializations;
+  Constant *InitFunc =
+      Ctx->getConstantExternSym(Ctx->getGlobalString("__asan_poison"));
+  Constant *DestroyFunc =
+      Ctx->getConstantExternSym(Ctx->getGlobalString("__asan_unpoison"));
+
+  InstAlloca *Cur;
+  ConstantInteger32 *VarSizeOp;
+  while (
+      (Cur = llvm::dyn_cast<InstAlloca>(iteratorToInst(C.getCur()))) &&
+      (VarSizeOp = llvm::dyn_cast<ConstantInteger32>(Cur->getSizeInBytes()))) {
+    HasLocals = true;
+
+    // create the new alloca that includes a redzone
+    SizeT VarSize = VarSizeOp->getValue();
+    Variable *Dest = Cur->getDest();
+    SizeT RzPadding = RzSize + Utils::OffsetToAlignment(VarSize, RzSize);
+    auto *ByteCount =
+        ConstantInteger32::create(Ctx, IceType_i32, VarSize + RzPadding);
+    constexpr SizeT Alignment = 8;
+    auto *NewVar = InstAlloca::create(Func, Dest, ByteCount, Alignment);
+
+    // calculate the redzone offset
+    Variable *RzLocVar = Func->makeVariable(IceType_i32);
+    RzLocVar->setName(Func, nextRzName());
+    auto *Offset = ConstantInteger32::create(Ctx, IceType_i32, VarSize);
+    auto *RzLoc = InstArithmetic::create(Func, InstArithmetic::Add, RzLocVar,
+                                         Dest, Offset);
+
+    // instructions to poison and unpoison the redzone
+    constexpr SizeT NumArgs = 2;
+    constexpr Variable *Void = nullptr;
+    constexpr bool NoTailcall = false;
+    auto *Init = InstCall::create(Func, NumArgs, Void, InitFunc, NoTailcall);
+    auto *Destroy =
+        InstCall::create(Func, NumArgs, Void, DestroyFunc, NoTailcall);
+    Init->addArg(RzLocVar);
+    Destroy->addArg(RzLocVar);
+    auto *RzSizeConst = ConstantInteger32::create(Ctx, IceType_i32, RzPadding);
+    Init->addArg(RzSizeConst);
+    Destroy->addArg(RzSizeConst);
+
+    Cur->setDeleted();
+    C.insert(NewVar);
+    ICE_TLS_GET_FIELD(LocalDtors)->emplace_back(Destroy);
+    Initializations.emplace_back(RzLoc);
+    Initializations.emplace_back(Init);
+
+    C.advanceCur();
+    C.advanceNext();
+  }
+
+  C.setInsertPoint(C.getCur());
+
+  // add the leftmost redzone
+  if (HasLocals) {
+    Variable *LastRz = Func->makeVariable(IceType_i32);
+    LastRz->setName(Func, nextRzName());
+    auto *ByteCount = ConstantInteger32::create(Ctx, IceType_i32, RzSize);
+    constexpr SizeT Alignment = 8;
+    auto *RzAlloca = InstAlloca::create(Func, LastRz, ByteCount, Alignment);
+
+    constexpr SizeT NumArgs = 2;
+    constexpr Variable *Void = nullptr;
+    constexpr bool NoTailcall = false;
+    auto *Init = InstCall::create(Func, NumArgs, Void, InitFunc, NoTailcall);
+    auto *Destroy =
+        InstCall::create(Func, NumArgs, Void, DestroyFunc, NoTailcall);
+    Init->addArg(LastRz);
+    Destroy->addArg(LastRz);
+    Init->addArg(RzAlloca->getSizeInBytes());
+    Destroy->addArg(RzAlloca->getSizeInBytes());
+
+    ICE_TLS_GET_FIELD(LocalDtors)->emplace_back(Destroy);
+    C.insert(RzAlloca);
+    C.insert(Init);
+  }
+
+  // insert initializers for the redzones
+  for (Inst *Init : Initializations) {
+    C.insert(Init);
+  }
 }
 
 void ASanInstrumentation::instrumentCall(LoweringContext &Context,
@@ -214,6 +276,15 @@
   Context.setNext(Next);
 }
 
+void ASanInstrumentation::instrumentRet(LoweringContext &Context, InstRet *) {
+  InstList::iterator Next = Context.getNext();
+  Context.setInsertPoint(Context.getCur());
+  for (InstCall *RzUnpoison : *ICE_TLS_GET_FIELD(LocalDtors)) {
+    Context.insert(RzUnpoison);
+  }
+  Context.setNext(Next);
+}
+
 void ASanInstrumentation::instrumentStart(Cfg *Func) {
   Constant *ShadowMemInit =
       Ctx->getConstantExternSym(Ctx->getGlobalString("__asan_init"));
@@ -224,4 +295,10 @@
   Func->getEntryNode()->getInsts().push_front(Call);
 }
 
+// TODO(tlively): make this more efficient with swap idiom
+void ASanInstrumentation::finishFunc(Cfg *Func) {
+  (void)Func;
+  ICE_TLS_GET_FIELD(LocalDtors)->clear();
+}
+
 } // end of namespace Ice
diff --git a/src/IceASanInstrumentation.h b/src/IceASanInstrumentation.h
index 2cf5c59..de250d4 100644
--- a/src/IceASanInstrumentation.h
+++ b/src/IceASanInstrumentation.h
@@ -31,7 +31,9 @@
   ASanInstrumentation &operator=(const ASanInstrumentation &) = delete;
 
 public:
-  ASanInstrumentation(GlobalContext *Ctx) : Instrumentation(Ctx), RzNum(0) {}
+  ASanInstrumentation(GlobalContext *Ctx) : Instrumentation(Ctx), RzNum(0) {
+    ICE_TLS_INIT_FIELD(LocalDtors);
+  }
   void instrumentGlobals(VariableDeclarationList &Globals) override;
 
 private:
@@ -40,15 +42,15 @@
                                 VariableDeclaration *RzArray,
                                 SizeT &RzArraySize,
                                 VariableDeclaration *Global);
-  InstAlloca *createLocalRz(LoweringContext &Context, SizeT Size,
-                            SizeT Alignment);
   void instrumentFuncStart(LoweringContext &Context) override;
-  void instrumentAlloca(LoweringContext &Context, InstAlloca *Instr) override;
   void instrumentCall(LoweringContext &Context, InstCall *Instr) override;
+  void instrumentRet(LoweringContext &Context, InstRet *Instr) override;
   void instrumentLoad(LoweringContext &Context, InstLoad *Instr) override;
   void instrumentStore(LoweringContext &Context, InstStore *Instr) override;
   void instrumentAccess(LoweringContext &Context, Operand *Op, SizeT Size);
   void instrumentStart(Cfg *Func) override;
+  void finishFunc(Cfg *Func) override;
+  ICE_TLS_DECLARE_FIELD(std::vector<InstCall *> *, LocalDtors);
   bool DidInsertRedZones = false;
   std::atomic<uint32_t> RzNum;
 };
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index a113b95..c2b4d06 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -645,16 +645,19 @@
   getTarget()->lowerArguments();
 }
 
-void Cfg::sortAndCombineAllocas(CfgVector<Inst *> &Allocas,
+void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
                                 uint32_t CombinedAlignment, InstList &Insts,
                                 AllocaBaseVariableType BaseVariableType) {
   if (Allocas.empty())
     return;
   // Sort by decreasing alignment.
-  std::sort(Allocas.begin(), Allocas.end(), [](Inst *I1, Inst *I2) {
-    auto *A1 = llvm::dyn_cast<InstAlloca>(I1);
-    auto *A2 = llvm::dyn_cast<InstAlloca>(I2);
-    return A1->getAlignInBytes() > A2->getAlignInBytes();
+  std::sort(Allocas.begin(), Allocas.end(), [](InstAlloca *A1, InstAlloca *A2) {
+    uint32_t Align1 = A1->getAlignInBytes();
+    uint32_t Align2 = A2->getAlignInBytes();
+    if (Align1 == Align2)
+      return A1->getNumber() > A2->getNumber();
+    else
+      return Align1 > Align2;
   });
   // Process the allocas in order of decreasing stack alignment.  This allows
   // us to pack less-aligned pieces after more-aligned ones, resulting in less
@@ -746,6 +749,8 @@
   bool HasLargeAlignment = false;
   bool HasDynamicAllocation = false;
   for (Inst &Instr : EntryNode->getInsts()) {
+    if (Instr.isDeleted())
+      continue;
     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
       uint32_t AlignmentParam = Alloca->getAlignInBytes();
       if (AlignmentParam > StackAlignment)
@@ -769,6 +774,8 @@
     if (Node == EntryNode)
       continue;
     for (Inst &Instr : Node->getInsts()) {
+      if (Instr.isDeleted())
+        continue;
       if (llvm::isa<InstAlloca>(&Instr)) {
         // Allocations outside the entry block require a frame pointer.
         HasDynamicAllocation = true;
@@ -784,13 +791,15 @@
   // Collect the Allocas into the two vectors.
   // Allocas in the entry block that have constant size and alignment less
   // than or equal to the function's stack alignment.
-  CfgVector<Inst *> FixedAllocas;
+  CfgVector<InstAlloca *> FixedAllocas;
   // Allocas in the entry block that have constant size and alignment greater
   // than the function's stack alignment.
-  CfgVector<Inst *> AlignedAllocas;
+  CfgVector<InstAlloca *> AlignedAllocas;
   // Maximum alignment used by any alloca.
   uint32_t MaxAlignment = StackAlignment;
   for (Inst &Instr : EntryNode->getInsts()) {
+    if (Instr.isDeleted())
+      continue;
     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
       if (!llvm::isa<Constant>(Alloca->getSizeInBytes()))
         continue;
diff --git a/src/IceCfg.h b/src/IceCfg.h
index e3f29c7..c656961 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -295,7 +295,7 @@
     BVT_FramePointer,
     BVT_UserPointer
   };
-  void sortAndCombineAllocas(CfgVector<Inst *> &Allocas,
+  void sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
                              uint32_t CombinedAlignment, InstList &Insts,
                              AllocaBaseVariableType BaseVariableType);
   void findRematerializable();
diff --git a/src/IceInstrumentation.cpp b/src/IceInstrumentation.cpp
index 64a6212..c911a0b 100644
--- a/src/IceInstrumentation.cpp
+++ b/src/IceInstrumentation.cpp
@@ -51,6 +51,8 @@
   std::string FuncName = Func->getFunctionName().toStringOrEmpty();
   if (FuncName == "_start")
     instrumentStart(Func);
+
+  finishFunc(Func);
 }
 
 void Instrumentation::instrumentInst(LoweringContext &Context) {
diff --git a/src/IceInstrumentation.h b/src/IceInstrumentation.h
index 60afef7..3a18542 100644
--- a/src/IceInstrumentation.h
+++ b/src/IceInstrumentation.h
@@ -44,8 +44,10 @@
   virtual void instrumentGlobals(VariableDeclarationList &) {}
   void instrumentFunc(Cfg *Func);
 
+protected:
+  virtual void instrumentInst(LoweringContext &Context);
+
 private:
-  void instrumentInst(LoweringContext &Context);
   virtual void instrumentFuncStart(LoweringContext &) {}
   virtual void instrumentAlloca(LoweringContext &, class InstAlloca *) {}
   virtual void instrumentArithmetic(LoweringContext &, class InstArithmetic *) {
@@ -71,6 +73,7 @@
                                      class InstUnreachable *) {}
   virtual void instrumentStart(Cfg *) {}
   virtual void instrumentLocalVars(Cfg *) {}
+  virtual void finishFunc(Cfg *) {}
 
 protected:
   GlobalContext *Ctx;
diff --git a/tests_lit/asan_tests/alignment.ll b/tests_lit/asan_tests/alignment.ll
new file mode 100644
index 0000000..53e96b3
--- /dev/null
+++ b/tests_lit/asan_tests/alignment.ll
@@ -0,0 +1,43 @@
+; Translate with -fsanitize-address and -O2 to test alignment and ordering of
+; redzones when allocas are coalesced.
+
+; REQUIRES: no_minimal_build
+
+; RUN: %p2i --filetype=obj --disassemble --target x8632 -i %s --args -O2 \
+; RUN:     -allow-externally-defined-symbols -fsanitize-address | FileCheck %s
+
+define internal i32 @func(i32 %arg1, i32 %arg2) {
+  %l1 = alloca i8, i32 4, align 4
+  %l2 = alloca i8, i32 5, align 1
+  ret i32 42
+}
+
+; CHECK: func
+; CHECK-NEXT: sub    esp,0xbc
+; CHECK-NEXT: lea    eax,[esp+0x10]
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x4],0x20
+; CHECK-NEXT: __asan_poison
+; CHECK-NEXT: lea    eax,[esp+0x74]
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x4],0x3c
+; CHECK-NEXT: __asan_poison
+; CHECK-NEXT: lea    eax,[esp+0x35]
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x4],0x3b
+; CHECK-NEXT: __asan_poison
+; CHECK-NEXT: lea    eax,[esp+0x74]
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x4],0x3c
+; CHECK-NEXT: __asan_unpoison
+; CHECK-NEXT: lea    eax,[esp+0x35]
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x4],0x3b
+; CHECK-NEXT: __asan_unpoison
+; CHECK-NEXT: lea    eax,[esp+0x10]
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x4],0x20
+; CHECK-NEXT: __asan_unpoison
+; CHECK-NEXT: mov    eax,0x2a
+; CHECK-NEXT: add    esp,0xbc
+; CHECK-NEXT: ret
diff --git a/tests_lit/asan_tests/instrumentload.ll b/tests_lit/asan_tests/instrumentload.ll
index 55a11df..037e71c 100644
--- a/tests_lit/asan_tests/instrumentload.ll
+++ b/tests_lit/asan_tests/instrumentload.ll
@@ -69,7 +69,7 @@
 ; DUMP-LABEL: ================ Instrumented CFG ================
 ; DUMP-NEXT: define internal void @doLoads() {
 ; DUMP-NEXT: __0:
-; DUMP: call void @__asan_check(i32 @srcConst8, i32 1)
+; DUMP:      call void @__asan_check(i32 @srcConst8, i32 1)
 ; DUMP-NEXT: %dest1 = load i8, i8* @srcConst8, align 1
 ; DUMP-NEXT: call void @__asan_check(i32 @srcConst16, i32 2)
 ; DUMP-NEXT: %dest2 = load i16, i16* @srcConst16, align 1
@@ -99,5 +99,5 @@
 ; DUMP-NEXT: %dest14 = load i64, i64* %srcLocal64, align 1
 ; DUMP-NEXT: call void @__asan_check(i32 %srcLocal128, i32 16)
 ; DUMP-NEXT: %dest15 = load <4 x i32>, <4 x i32>* %srcLocal128, align 4
-; DUMP-NEXT: ret void
-; DUMP-NEXT }
+; DUMP:      ret void
+; DUMP-NEXT: }
diff --git a/tests_lit/asan_tests/instrumentlocals.ll b/tests_lit/asan_tests/instrumentlocals.ll
index 1031382..b8b7a03 100644
--- a/tests_lit/asan_tests/instrumentlocals.ll
+++ b/tests_lit/asan_tests/instrumentlocals.ll
@@ -1,4 +1,4 @@
-; Test for insertion of redzones around global variables
+; Test for insertion of redzones around local variables
 
 ; REQUIRES: allow_dump
 
@@ -7,42 +7,39 @@
 
 ; Function with local variables to be instrumented
 define internal void @func() {
-  %local0 = alloca i8, i32 4, align 4
-  %local1 = alloca i8, i32 32, align 4
-  %local2 = alloca i8, i32 13, align 4
-  %local3 = alloca i8, i32 75, align 4
-  %local4 = alloca i8, i32 64, align 4
-  %local5 = alloca i8, i32 4, align 1
-  %local6 = alloca i8, i32 32, align 1
-  %local7 = alloca i8, i32 13, align 1
-  %local8 = alloca i8, i32 75, align 1
-  %local9 = alloca i8, i32 64, align 1
+  %local1 = alloca i8, i32 4, align 4
+  %local2 = alloca i8, i32 32, align 1
+  %local3 = alloca i8, i32 13, align 2
+  %local4 = alloca i8, i32 75, align 4
+  %local5 = alloca i8, i32 64, align 8
   ret void
 }
 
 ; DUMP-LABEL: ================ Instrumented CFG ================
 ; DUMP-NEXT: define internal void @func() {
 ; DUMP-NEXT: __0:
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 32, align 4
-; DUMP-NEXT: %local0 = alloca i8, i32 4, align 4
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 60, align 1
-; DUMP-NEXT: %local1 = alloca i8, i32 32, align 4
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 32, align 1
-; DUMP-NEXT: %local2 = alloca i8, i32 13, align 4
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 51, align 1
-; DUMP-NEXT: %local3 = alloca i8, i32 75, align 4
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 53, align 1
-; DUMP-NEXT: %local4 = alloca i8, i32 64, align 4
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 32, align 1
-; DUMP-NEXT: %local5 = alloca i8, i32 4, align 1
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 60, align 1
-; DUMP-NEXT: %local6 = alloca i8, i32 32, align 1
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 32, align 1
-; DUMP-NEXT: %local7 = alloca i8, i32 13, align 1
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 51, align 1
-; DUMP-NEXT: %local8 = alloca i8, i32 75, align 1
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 53, align 1
-; DUMP-NEXT: %local9 = alloca i8, i32 64, align 
-; DUMP-NEXT: %__$rz{{[0-9]+}} = alloca i8, i32 32, align 1
+; DUMP-NEXT: %local1 = alloca i8, i32 64, align 8
+; DUMP-NEXT: %local2 = alloca i8, i32 64, align 8
+; DUMP-NEXT: %local3 = alloca i8, i32 64, align 8
+; DUMP-NEXT: %local4 = alloca i8, i32 128, align 8
+; DUMP-NEXT: %local5 = alloca i8, i32 96, align 8
+; DUMP-NEXT: %__$rz[[RZ0:[0-9]+]] = alloca i8, i32 32, align 8
+; DUMP-NEXT: call void @__asan_poison(i32 %__$rz[[RZ0]], i32 32)
+; DUMP-NEXT: %__$rz[[RZ1:[0-9]+]] = add i32 %local1, 4
+; DUMP-NEXT: call void @__asan_poison(i32 %__$rz[[RZ1]], i32 60)
+; DUMP-NEXT: %__$rz[[RZ2:[0-9]+]] = add i32 %local2, 32
+; DUMP-NEXT: call void @__asan_poison(i32 %__$rz[[RZ2]], i32 32)
+; DUMP-NEXT: %__$rz[[RZ3:[0-9]+]] = add i32 %local3, 13
+; DUMP-NEXT: call void @__asan_poison(i32 %__$rz[[RZ3]], i32 51)
+; DUMP-NEXT: %__$rz[[RZ4:[0-9]+]] = add i32 %local4, 75
+; DUMP-NEXT: call void @__asan_poison(i32 %__$rz[[RZ4]], i32 53)
+; DUMP-NEXT: %__$rz[[RZ5:[0-9]+]] = add i32 %local5, 64
+; DUMP-NEXT: call void @__asan_poison(i32 %__$rz[[RZ5]], i32 32)
+; DUMP-NEXT: call void @__asan_unpoison(i32 %__$rz[[RZ1]], i32 60)
+; DUMP-NEXT: call void @__asan_unpoison(i32 %__$rz[[RZ2]], i32 32)
+; DUMP-NEXT: call void @__asan_unpoison(i32 %__$rz[[RZ3]], i32 51)
+; DUMP-NEXT: call void @__asan_unpoison(i32 %__$rz[[RZ4]], i32 53)
+; DUMP-NEXT: call void @__asan_unpoison(i32 %__$rz[[RZ5]], i32 32)
+; DUMP-NEXT: call void @__asan_unpoison(i32 %__$rz[[RZ0]], i32 32)
 ; DUMP-NEXT: ret void
 ; DUMP-NEXT: }
diff --git a/tests_lit/asan_tests/instrumentstore.ll b/tests_lit/asan_tests/instrumentstore.ll
index c81edac..0a4c94d 100644
--- a/tests_lit/asan_tests/instrumentstore.ll
+++ b/tests_lit/asan_tests/instrumentstore.ll
@@ -70,5 +70,5 @@
 ; DUMP-NEXT: store i64 42, i64* %destLocal64, align 1
 ; DUMP-NEXT: call void @__asan_check(i32 %destLocal128, i32 16)
 ; DUMP-NEXT: store <4 x i32> %vecSrc, <4 x i32>* %destLocal128, align 4
-; DUMP-NEXT: ret void
+; DUMP:      ret void
 ; DUMP-NEXT: }
diff --git a/tests_lit/llvm2ice_tests/fused-alloca-arg.ll b/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
index 1b009ba..d76755e 100644
--- a/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
@@ -49,16 +49,16 @@
 ; CHECK-LABEL:  caller2
 ; CHECK-NEXT:   sub    esp,0x6c
 ; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x70]
-; CHECK-NEXT:   mov    DWORD PTR [esp+0x20],eax
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x40],eax
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x20],eax
 ; CHECK-NEXT:   mov    DWORD PTR [esp],eax
-; CHECK-NEXT:   lea    eax,[esp+0x20]
+; CHECK-NEXT:   lea    eax,[esp+0x40]
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x4],eax
-; CHECK-NEXT:   lea    eax,[esp+0x40]
-; CHECK-NEXT:   mov    DWORD PTR [esp+0x8],eax
 ; CHECK-NEXT:   lea    eax,[esp+0x20]
-; CHECK-NEXT:   mov    DWORD PTR [esp+0xc],eax
+; CHECK-NEXT:   mov    DWORD PTR [esp+0x8],eax
 ; CHECK-NEXT:   lea    eax,[esp+0x40]
+; CHECK-NEXT:   mov    DWORD PTR [esp+0xc],eax
+; CHECK-NEXT:   lea    eax,[esp+0x20]
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x10],eax
 ; CHECK-NEXT:   call
 ; CHECK-NEXT:   add    esp,0x6c
diff --git a/tests_lit/llvm2ice_tests/fused-alloca.ll b/tests_lit/llvm2ice_tests/fused-alloca.ll
index 42e644b..3b37ada 100644
--- a/tests_lit/llvm2ice_tests/fused-alloca.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca.ll
@@ -21,8 +21,8 @@
 ; CHECK-LABEL: fused_small_align
 ; CHECK-NEXT: sub    esp,0x30
 ; CHECK-NEXT: mov    eax,DWORD PTR [esp+0x34]
+; CHECK-NEXT: mov    DWORD PTR [esp+0x1c],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x10],eax
-; CHECK-NEXT: mov    DWORD PTR [esp+0x18],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp],eax
 ; CHECK-NEXT: add    esp,0x30
 
@@ -46,9 +46,9 @@
 ; CHECK-NEXT: sub    esp,0x80
 ; CHECK-NEXT: and    esp,0xffffffc0
 ; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0x8]
-; CHECK-NEXT: mov    DWORD PTR [esp+0x40],eax
-; CHECK-NEXT: mov    DWORD PTR [esp],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x60],eax
+; CHECK-NEXT: mov    DWORD PTR [esp],eax
+; CHECK-NEXT: mov    DWORD PTR [esp+0x40],eax
 ; CHECK-NEXT: mov    esp,ebp
 ; CHECK-NEXT: pop    ebp
 
@@ -154,10 +154,10 @@
 ; CHECK-NEXT: add    ecx,0x0
 ; CHECK-NEXT: sub    esp,0x10
 ; CHECK-NEXT: mov    ebx,esp
-; CHECK-NEXT: mov    DWORD PTR [ecx],eax
 ; CHECK-NEXT: mov    DWORD PTR [edx],eax
-; CHECK-NEXT: mov    DWORD PTR [ebp-0x14],eax
+; CHECK-NEXT: mov    DWORD PTR [ecx],eax
 ; CHECK-NEXT: mov    DWORD PTR [ebp-0x24],eax
+; CHECK-NEXT: mov    DWORD PTR [ebp-0x14],eax
 ; CHECK-NEXT: mov    DWORD PTR [ebx],eax
 ; CHECK-NEXT: mov    esp,ebp
 ; CHECK-NEXT: pop    ebp