diff --git a/third_party/marl/include/marl/conditionvariable.h b/third_party/marl/include/marl/conditionvariable.h
index 5a57db2..8579dcb 100644
--- a/third_party/marl/include/marl/conditionvariable.h
+++ b/third_party/marl/include/marl/conditionvariable.h
@@ -17,7 +17,6 @@
 
 #include "containers.h"
 #include "debug.h"
-#include "defer.h"
 #include "memory.h"
 #include "mutex.h"
 #include "scheduler.h"
@@ -159,10 +158,10 @@
   if (pred()) {
     return true;
   }
-  numWaiting++;
-  defer(numWaiting--);
 
   if (auto fiber = Scheduler::Fiber::current()) {
+    numWaiting++;
+
     // Currently executing on a scheduler fiber.
     // Yield to let other tasks run that can unblock this fiber.
     mutex.lock();
@@ -175,14 +174,18 @@
     waiting.erase(it);
     mutex.unlock();
 
+    numWaiting--;
     return res;
-  } else {
-    // Currently running outside of the scheduler.
-    // Delegate to the std::condition_variable.
-    numWaitingOnCondition++;
-    defer(numWaitingOnCondition--);
-    return lock.wait_until(condition, timeout, pred);
   }
+
+  // Currently running outside of the scheduler.
+  // Delegate to the std::condition_variable.
+  numWaiting++;
+  numWaitingOnCondition++;
+  auto res = lock.wait_until(condition, timeout, pred);
+  numWaitingOnCondition--;
+  numWaiting--;
+  return res;
 }
 
 }  // namespace marl
diff --git a/third_party/marl/include/marl/containers.h b/third_party/marl/include/marl/containers.h
index fcac46e..acf421c 100644
--- a/third_party/marl/include/marl/containers.h
+++ b/third_party/marl/include/marl/containers.h
@@ -292,6 +292,11 @@
   list& operator=(const list&) = delete;
   list& operator=(list&&) = delete;
 
+  struct AllocationChain {  // Node of the list's chain of allocations.
+    Allocation allocation;  // Allocation to hand back to the allocator.
+    AllocationChain* next;  // Next node in the chain, or nullptr.
+  };
+
   void grow(size_t count);
 
   static void unlink(Entry* entry, Entry*& list);
@@ -300,7 +305,7 @@
   Allocator* const allocator;
   size_t size_ = 0;
   size_t capacity = 0;
-  vector<Allocation, 8> allocations;
+  AllocationChain* allocations = nullptr;
   Entry* free = nullptr;
   Entry* head = nullptr;
 };
@@ -336,17 +341,19 @@
 
 template <typename T>
 list<T>::list(Allocator* allocator /* = Allocator::Default */)
-    : allocator(allocator), allocations(allocator) {
-  grow(8);
-}
+    : allocator(allocator) {}  // Storage is grown lazily by emplace_front().
 
 template <typename T>
 list<T>::~list() {
   for (auto el = head; el != nullptr; el = el->next) {
     el->data.~T();
   }
-  for (auto alloc : allocations) {
-    allocator->free(alloc);
+
+  auto curr = allocations;
+  while (curr != nullptr) {
+    auto next = curr->next;  // Read first: curr lives in the block being freed.
+    allocator->free(curr->allocation);
+    curr = next;
   }
 }
 
@@ -369,7 +376,7 @@
 template <typename... Args>
 typename list<T>::iterator list<T>::emplace_front(Args&&... args) {
   if (free == nullptr) {
-    grow(capacity);
+    grow(std::max<size_t>(capacity, 8));
   }
 
   auto entry = free;
@@ -395,9 +402,13 @@
 
 template <typename T>
 void list<T>::grow(size_t count) {
+  auto const entriesSize = sizeof(Entry) * count;
+  auto const allocChainOffset = alignUp(entriesSize, alignof(AllocationChain));
+  auto const allocSize = allocChainOffset + sizeof(AllocationChain);
+
   Allocation::Request request;
-  request.size = sizeof(Entry) * count;
-  request.alignment = alignof(Entry);
+  request.size = allocSize;
+  request.alignment = std::max(alignof(Entry), alignof(AllocationChain));
   request.usage = Allocation::Usage::List;
   auto alloc = allocator->allocate(request);
 
@@ -412,7 +423,12 @@
     free = entry;
   }
 
-  allocations.emplace_back(std::move(alloc));
+  auto allocChain = reinterpret_cast<AllocationChain*>(
+      reinterpret_cast<uint8_t*>(alloc.ptr) + allocChainOffset);
+
+  allocChain->allocation = alloc;
+  allocChain->next = allocations;
+  allocations = allocChain;
 
   capacity += count;
 }
diff --git a/third_party/marl/include/marl/event.h b/third_party/marl/include/marl/event.h
index bac6078..dbc9f4f 100644
--- a/third_party/marl/include/marl/event.h
+++ b/third_party/marl/include/marl/event.h
@@ -26,7 +26,7 @@
 // Event is a synchronization primitive used to block until a signal is raised.
 class Event {
  public:
-  enum class Mode {
+  enum class Mode : uint8_t {
     // The event signal will be automatically reset when a call to wait()
     // returns.
     // A single call to signal() will only unblock a single (possibly
@@ -115,9 +115,9 @@
 
     marl::mutex mutex;
     ConditionVariable cv;
+    containers::vector<std::shared_ptr<Shared>, 1> deps;
     const Mode mode;
     bool signalled;
-    containers::vector<std::shared_ptr<Shared>, 2> deps;
   };
 
   const std::shared_ptr<Shared> shared;
diff --git a/third_party/marl/include/marl/memory.h b/third_party/marl/include/marl/memory.h
index 8c35e01..8608851 100644
--- a/third_party/marl/include/marl/memory.h
+++ b/third_party/marl/include/marl/memory.h
@@ -31,10 +31,15 @@
 // system.
 size_t pageSize();
 
+template <typename T>  // Rounds val up to the nearest multiple of alignment.
+inline T alignUp(T val, T alignment) {
+  return alignment * ((val + alignment - 1) / alignment);
+}
+
 // Allocation holds the result of a memory allocation from an Allocator.
 struct Allocation {
   // Intended usage of the allocation. Used for allocation trackers.
-  enum class Usage {
+  enum class Usage : uint8_t {
     Undefined = 0,
     Stack,   // Fiber stack
     Create,  // Allocator::create(), make_unique(), make_shared()
diff --git a/third_party/marl/include/marl/parallelize.h b/third_party/marl/include/marl/parallelize.h
index d7ceadc..ce75b02 100644
--- a/third_party/marl/include/marl/parallelize.h
+++ b/third_party/marl/include/marl/parallelize.h
@@ -22,13 +22,13 @@
 
 namespace detail {
 
-void parallelizeChain(WaitGroup*) {}
+inline void parallelizeChain(WaitGroup&) {}  // Ends the variadic recursion.
 
 template <typename F, typename... L>
-void parallelizeChain(WaitGroup* wg, F&& f, L&&... l) {
+void parallelizeChain(WaitGroup& wg, F&& f, L&&... l) {
   schedule([=] {
     f();
-    wg->done();
+    wg.done();
   });
   parallelizeChain(wg, std::forward<L>(l)...);
 }
@@ -41,7 +41,7 @@
 template <typename... FUNCTIONS>
 inline void parallelize(FUNCTIONS&&... functions) {
   WaitGroup wg(sizeof...(FUNCTIONS));
-  detail::parallelizeChain(&wg, functions...);
+  detail::parallelizeChain(wg, functions...);
   wg.wait();
 }
 
diff --git a/third_party/marl/include/marl/pool.h b/third_party/marl/include/marl/pool.h
index 393b457..70d53c9 100644
--- a/third_party/marl/include/marl/pool.h
+++ b/third_party/marl/include/marl/pool.h
@@ -192,7 +192,7 @@
 
 template <typename T>
 T* Pool<T>::Loan::get() const {
-  return item->get();
+  return item ? item->get() : nullptr;  // A Loan with no item yields nullptr.
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/marl/src/event_test.cpp b/third_party/marl/src/event_test.cpp
index d00e721..0250329 100644
--- a/third_party/marl/src/event_test.cpp
+++ b/third_party/marl/src/event_test.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "marl/event.h"
+#include "marl/defer.h"
 #include "marl/waitgroup.h"
 
 #include "marl_test.h"
diff --git a/third_party/marl/src/memory.cpp b/third_party/marl/src/memory.cpp
index b9f6cc1..3ccfa3d 100644
--- a/third_party/marl/src/memory.cpp
+++ b/third_party/marl/src/memory.cpp
@@ -129,11 +129,6 @@
 
 namespace {
 
-template <typename T>
-inline T alignUp(T val, T alignment) {
-  return alignment * ((val + alignment - 1) / alignment);
-}
-
 // pagedMalloc() allocates size bytes of uninitialized storage with the
 // specified minimum byte alignment using OS specific page mapping calls.
 // If guardLow is true then reads or writes to the page below the returned
@@ -188,8 +183,8 @@
 inline void* alignedMalloc(size_t alignment, size_t size) {
   size_t allocSize = size + alignment + sizeof(void*);
   auto allocation = malloc(allocSize);
-  auto aligned = reinterpret_cast<uint8_t*>(
-      alignUp(reinterpret_cast<uintptr_t>(allocation), alignment));  // align
+  auto aligned = reinterpret_cast<uint8_t*>(marl::alignUp(
+      reinterpret_cast<uintptr_t>(allocation), alignment));  // align
   memcpy(aligned + size, &allocation, sizeof(void*));  // pointer-to-allocation
   return aligned;
 }
diff --git a/third_party/marl/src/osfiber_test.cpp b/third_party/marl/src/osfiber_test.cpp
index 3b6fe81..d31b3a8 100644
--- a/third_party/marl/src/osfiber_test.cpp
+++ b/third_party/marl/src/osfiber_test.cpp
@@ -16,9 +16,14 @@
 
 #include "marl_test.h"
 
+namespace {
+
+auto constexpr fiberStackSize = 8 * 1024;  // Shared by the fiber tests below.
+
+}  // anonymous namespace
+
 TEST_F(WithoutBoundScheduler, OSFiber) {
   std::string str;
-  auto constexpr fiberStackSize = 8 * 1024;
   auto main = marl::OSFiber::createFiberFromCurrentThread(allocator);
   marl::Allocator::unique_ptr<marl::OSFiber> fiberA, fiberB, fiberC;
   fiberC = marl::OSFiber::createFiber(allocator, fiberStackSize, [&] {
@@ -38,3 +43,26 @@
 
   ASSERT_EQ(str, "CBA");
 }
+
+TEST_F(WithoutBoundScheduler, StackAlignment) {
+  uintptr_t address = 0;  // Set by the fiber to the address of its local.
+
+  struct alignas(16) AlignTo16Bytes {
+    uint64_t a, b;
+  };
+
+  auto main = marl::OSFiber::createFiberFromCurrentThread(allocator);
+  marl::Allocator::unique_ptr<marl::OSFiber> fiber;
+  fiber = marl::OSFiber::createFiber(allocator, fiberStackSize, [&] {
+    AlignTo16Bytes stack_var;  // Local declared in the fiber's entry lambda.
+
+    address = reinterpret_cast<uintptr_t>(&stack_var);
+
+    fiber->switchTo(main.get());  // Return to the main fiber.
+  });
+
+  main->switchTo(fiber.get());
+
+  ASSERT_TRUE((address & 15) == 0)  // Low four bits clear => 16-byte aligned.
+      << "Stack variable had unaligned address: 0x" << std::hex << address;
+}
diff --git a/third_party/marl/src/pool_test.cpp b/third_party/marl/src/pool_test.cpp
index 6ae8f25..eec09b5 100644
--- a/third_party/marl/src/pool_test.cpp
+++ b/third_party/marl/src/pool_test.cpp
@@ -26,6 +26,16 @@
   marl::BoundedPool<int, 10> pool;
 }
 
+TEST_P(WithBoundScheduler, UnboundedPoolLoan_GetNull) {
+  marl::UnboundedPool<int>::Loan loan;  // Not obtained from a pool.
+  ASSERT_EQ(loan.get(), nullptr);
+}
+
+TEST_P(WithBoundScheduler, BoundedPoolLoan_GetNull) {
+  marl::BoundedPool<int, 10>::Loan loan;  // Not obtained from a pool.
+  ASSERT_EQ(loan.get(), nullptr);
+}
+
 TEST_P(WithBoundScheduler, UnboundedPool_Borrow) {
   marl::UnboundedPool<int> pool;
   for (int i = 0; i < 100; i++) {
