Squashed 'third_party/marl/' changes from 14e4d862a..246091e81

246091e81 Implement support for MIPS64
2b3e3fe6d Core: Rework fiber scheduling to fix broken behavior.
a10646182 Add tool to diff benchmarks.
2c9624c1b Kokoro: Print test output to logs.
36208b483 Use SAL annotations more aggressively in the scheduler.
492edc979 Add benchmarks using google benchmark
32102554d Add new 'hello task' example, add to README.md
9b860935b CMake: Make the marl target's include directory public
8cdb1b1f0 CMakeLists.txt: Make it easier to specify third_party vars.
e25626a8e CMakeLists: Don't repeatedly include MARL_OS_LIBS

git-subtree-dir: third_party/marl
git-subtree-split: 246091e814658337b22f212c97d0659ea3673c9f
diff --git a/.clang-format b/.clang-format
index 2fbfe15..b0823b4 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,2 +1,10 @@
 # http://clang.llvm.org/docs/ClangFormatStyleOptions.html
-BasedOnStyle: Chromium
\ No newline at end of file
+BasedOnStyle: Chromium
+
+---
+Language:        Cpp
+StatementMacros:
+ - _Acquires_lock_
+ - _Releases_lock_
+ - _Requires_lock_held_
+ - _When_
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index 5a4e85a..335eee4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "third_party/googletest"]
 	path = third_party/googletest
 	url = https://github.com/google/googletest.git
+[submodule "third_party/benchmark"]
+	path = third_party/benchmark
+	url = https://github.com/google/benchmark.git
diff --git a/AUTHORS b/AUTHORS
index e6ebb88..8a5bf2a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,3 +6,4 @@
 Google LLC
 Shawn Anastasio <shawn@anastas.io>
 A. Wilcox <awilfox@adelielinux.org>
+Jiaxun Yang <jiaxun.yang@flygoat.com>
diff --git a/BUILD.bazel b/BUILD.bazel
index bcb5643..3d95fe9 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -21,6 +21,7 @@
             "src/**/*.h",
         ],
         exclude = glob([
+            "src/**/*_bench.cpp",
             "src/**/*_test.cpp",
         ]),
     ) + select({
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dfc57a5..1119719 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,33 +21,60 @@
 ###########################################################
 # Options
 ###########################################################
-option(MARL_WARNINGS_AS_ERRORS "Treat warnings as errors" OFF)
-option(MARL_BUILD_EXAMPLES "Build example applications" OFF)
-option(MARL_BUILD_TESTS "Build tests" OFF)
-option(MARL_ASAN "Build marl with address sanitizer" OFF)
-option(MARL_MSAN "Build marl with memory sanitizer" OFF)
-option(MARL_TSAN "Build marl with thread sanitizer" OFF)
-option(MARL_INSTALL "Create marl install target" OFF)
+function (option_if_not_defined name description default)
+    if(NOT DEFINED ${name})
+        option(${name} ${description} ${default})
+    endif()
+endfunction()
+
+option_if_not_defined(MARL_WARNINGS_AS_ERRORS "Treat warnings as errors" OFF)
+option_if_not_defined(MARL_BUILD_EXAMPLES "Build example applications" OFF)
+option_if_not_defined(MARL_BUILD_TESTS "Build tests" OFF)
+option_if_not_defined(MARL_BUILD_BENCHMARKS "Build benchmarks" OFF)
+option_if_not_defined(MARL_ASAN "Build marl with address sanitizer" OFF)
+option_if_not_defined(MARL_MSAN "Build marl with memory sanitizer" OFF)
+option_if_not_defined(MARL_TSAN "Build marl with thread sanitizer" OFF)
+option_if_not_defined(MARL_INSTALL "Create marl install target" OFF)
 
 ###########################################################
 # Directories
 ###########################################################
+function (set_if_not_defined name value)
+    if(NOT DEFINED ${name})
+        set(${name} ${value} PARENT_SCOPE)
+    endif()
+endfunction()
+
 set(MARL_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
 set(MARL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
-set(THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
-set(GOOGLETEST_DIR ${THIRD_PARTY_DIR}/googletest)
+set_if_not_defined(MARL_THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
+set_if_not_defined(MARL_GOOGLETEST_DIR ${MARL_THIRD_PARTY_DIR}/googletest)
+set_if_not_defined(MARL_BENCHMARK_DIR ${MARL_THIRD_PARTY_DIR}/benchmark)
 
 ###########################################################
 # Submodules
 ###########################################################
 if(MARL_BUILD_TESTS)
-    if(NOT EXISTS ${THIRD_PARTY_DIR}/googletest/.git)
+    if(NOT EXISTS ${MARL_GOOGLETEST_DIR}/.git)
         message(WARNING "third_party/googletest submodule missing.")
         message(WARNING "Run: `git submodule update --init` to build tests.")
         set(MARL_BUILD_TESTS OFF)
     endif()
 endif(MARL_BUILD_TESTS)
 
+if(MARL_BUILD_BENCHMARKS)
+    if(NOT EXISTS ${MARL_BENCHMARK_DIR}/.git)
+        message(WARNING "third_party/benchmark submodule missing.")
+        message(WARNING "Run: `git submodule update --init` to build benchmarks.")
+        set(MARL_BUILD_BENCHMARKS OFF)
+    endif()
+endif(MARL_BUILD_BENCHMARKS)
+
+if(MARL_BUILD_BENCHMARKS)
+    set(BENCHMARK_ENABLE_TESTING FALSE CACHE BOOL FALSE FORCE)
+    add_subdirectory(${MARL_BENCHMARK_DIR})
+endif(MARL_BUILD_BENCHMARKS)
+
 ###########################################################
 # File lists
 ###########################################################
@@ -64,9 +91,11 @@
         ${MARL_SRC_DIR}/osfiber_arm.c
         ${MARL_SRC_DIR}/osfiber_asm_aarch64.S
         ${MARL_SRC_DIR}/osfiber_asm_arm.S
+        ${MARL_SRC_DIR}/osfiber_asm_mips64.S
         ${MARL_SRC_DIR}/osfiber_asm_ppc64.S
         ${MARL_SRC_DIR}/osfiber_asm_x64.S
         ${MARL_SRC_DIR}/osfiber_asm_x86.S
+        ${MARL_SRC_DIR}/osfiber_mips64.c
         ${MARL_SRC_DIR}/osfiber_ppc64.c
         ${MARL_SRC_DIR}/osfiber_x64.c
         ${MARL_SRC_DIR}/osfiber_x86.c
@@ -87,7 +116,6 @@
 ###########################################################
 # Functions
 ###########################################################
-
 function(marl_set_target_options target)
     # Enable all warnings
     if(MSVC)
@@ -124,7 +152,7 @@
         target_link_libraries(${target} "-fsanitize=thread")
     endif()
 
-    target_include_directories(${target} PRIVATE ${MARL_INCLUDE_DIR})
+    target_include_directories(${target} PUBLIC ${MARL_INCLUDE_DIR})
 endfunction(marl_set_target_options)
 
 ###########################################################
@@ -181,13 +209,13 @@
         ${MARL_SRC_DIR}/scheduler_test.cpp
         ${MARL_SRC_DIR}/ticket_test.cpp
         ${MARL_SRC_DIR}/waitgroup_test.cpp
-        ${GOOGLETEST_DIR}/googletest/src/gtest-all.cc
+        ${MARL_GOOGLETEST_DIR}/googletest/src/gtest-all.cc
     )
 
     set(MARL_TEST_INCLUDE_DIR
-        ${GOOGLETEST_DIR}/googletest/include/
-        ${GOOGLETEST_DIR}/googlemock/include/
-        ${GOOGLETEST_DIR}/googletest/
+        ${MARL_GOOGLETEST_DIR}/googletest/include/
+        ${MARL_GOOGLETEST_DIR}/googlemock/include/
+        ${MARL_GOOGLETEST_DIR}/googletest/
     )
 
     add_executable(marl-unittests ${MARL_TEST_LIST})
@@ -199,9 +227,29 @@
 
     marl_set_target_options(marl-unittests)
 
-    target_link_libraries(marl-unittests marl "${MARL_OS_LIBS}")
+    target_link_libraries(marl-unittests marl)
 endif(MARL_BUILD_TESTS)
 
+# benchmarks
+if(MARL_BUILD_BENCHMARKS)
+    set(MARL_BENCHMARK_LIST
+        ${MARL_SRC_DIR}/blockingcall_bench.cpp
+        ${MARL_SRC_DIR}/defer_bench.cpp
+        ${MARL_SRC_DIR}/event_bench.cpp
+        ${MARL_SRC_DIR}/marl_bench.cpp
+        ${MARL_SRC_DIR}/non_marl_bench.cpp
+        ${MARL_SRC_DIR}/scheduler_bench.cpp
+        ${MARL_SRC_DIR}/ticket_bench.cpp
+        ${MARL_SRC_DIR}/waitgroup_bench.cpp
+    )
+
+    add_executable(marl-benchmarks ${MARL_BENCHMARK_LIST})
+
+    marl_set_target_options(marl-benchmarks)
+
+    target_link_libraries(marl-benchmarks benchmark::benchmark marl)
+endif(MARL_BUILD_BENCHMARKS)
+
 # examples
 if(MARL_BUILD_EXAMPLES)
     function(build_example target)
@@ -210,10 +258,10 @@
             FOLDER "Examples"
         )
         marl_set_target_options(${target})
-        target_link_libraries(${target} marl "${MARL_OS_LIBS}")
+        target_link_libraries(${target} marl)
     endfunction(build_example)
 
     build_example(fractal)
+    build_example(hello_task)
     build_example(primes)
-
 endif(MARL_BUILD_EXAMPLES)
diff --git a/README.md b/README.md
index 8e847ca..da1f5a1 100644
--- a/README.md
+++ b/README.md
@@ -8,10 +8,57 @@
 
 Marl uses a combination of fibers and threads to allow efficient execution of tasks that can block, while keeping a fixed number of hardware threads.
 
-Marl supports Windows, macOS, Linux, Fuchsia and Android (arm, aarch64, ppc64 (ELFv2), x86 and x64).
+Marl supports Windows, macOS, Linux, Fuchsia and Android (arm, aarch64, mips64, ppc64 (ELFv2), x86 and x64).
 
 Marl has no dependencies on other libraries (with an exception on googletest for building the optional unit tests).
 
+Example:
+
+```cpp
+#include "marl/defer.h"
+#include "marl/event.h"
+#include "marl/scheduler.h"
+
+#include <cstdio>
+
+int main() {
+  // Create a marl scheduler using 4 hardware threads.
+  // Bind this scheduler to the main thread so we can call marl::schedule()
+  marl::Scheduler scheduler;
+  scheduler.bind();
+  scheduler.setWorkerThreadCount(4);
+  defer(scheduler.unbind());  // Automatically unbind before returning.
+
+  // Create an event that automatically resets itself.
+  marl::Event sayHellow(marl::Event::Mode::Auto);
+  marl::Event saidHellow(marl::Event::Mode::Auto);
+
+  // Schedule some tasks to run asynchronously.
+  for (int i = 0; i < 10; i++) {
+    // Each task will run on one of the 4 worker threads.
+    marl::schedule([=] {  // All marl primitives are capture-by-value.
+      printf("Task %d waiting to say hello!\n", i);
+
+      // Blocking in a task?
+      // The scheduler will find something else for this thread to do.
+      sayHellow.wait();
+
+      printf("Hello from task %d!\n", i);
+
+      saidHellow.signal();
+    });
+  }
+
+  // Unblock the tasks one by one.
+  for (int i = 0; i < 10; i++) {
+    sayHellow.signal();
+    saidHellow.wait();
+  }
+
+  // All tasks are guaranteed to complete before the scheduler is destructed.
+}
+```
+
 ## Building
 
 Marl contains many unit tests and examples that can be built using CMake.
@@ -55,10 +102,13 @@
 target_link_libraries(<target> marl) # replace <target> with the name of your project's target
 ```
 
-You will also want to add the `marl` public headers to your project's include search paths so you can `#include` the marl headers:
+You may also wish to specify your own paths to the third party libraries used by `marl`.
+You can do this by setting any of the following variables before the call to `add_subdirectory()`:
 
 ```cmake
-target_include_directories($<target> PRIVATE "${MARL_DIR}/include") # replace <target> with the name of your project's target
+set(MARL_THIRD_PARTY_DIR <third-party-root-directory>) # defaults to ${MARL_DIR}/third_party
+set(MARL_GOOGLETEST_DIR  <path-to-googletest>)         # defaults to ${MARL_THIRD_PARTY_DIR}/googletest
+add_subdirectory(${MARL_DIR})
 ```
 
 ---
diff --git a/examples/BUILD.bazel b/examples/BUILD.bazel
index 5d68d73..d06f4b8 100644
--- a/examples/BUILD.bazel
+++ b/examples/BUILD.bazel
@@ -13,21 +13,31 @@
 # limitations under the License.
 
 cc_binary(
-  name = "fractal",
-  srcs = [
-    "fractal.cpp",
-  ],
-  deps = [
-    "//:marl",
-  ],
+    name = "fractal",
+    srcs = [
+        "fractal.cpp",
+    ],
+    deps = [
+        "//:marl",
+    ],
 )
 
 cc_binary(
-  name = "primes",
-  srcs = [
-    "primes.cpp",
-  ],
-  deps = [
-    "//:marl",
-  ],
+    name = "hello_task",
+    srcs = [
+        "hello_task.cpp",
+    ],
+    deps = [
+        "//:marl",
+    ],
+)
+
+cc_binary(
+    name = "primes",
+    srcs = [
+        "primes.cpp",
+    ],
+    deps = [
+        "//:marl",
+    ],
 )
diff --git a/examples/fractal.cpp b/examples/fractal.cpp
index af20dd4..4d1b4ba 100644
--- a/examples/fractal.cpp
+++ b/examples/fractal.cpp
@@ -146,10 +146,7 @@
 constexpr float cx = -0.8f;
 constexpr float cy = 0.156f;
 
-int main(int argc, const char** argv) {
-  (void)argc;  // unused parameter
-  (void)argv;  // unused parameter
-
+int main() {
   // Create a marl scheduler using the full number of logical cpus.
   // Bind this scheduler to the main thread so we can call marl::schedule()
   marl::Scheduler scheduler;
diff --git a/examples/hello_task.cpp b/examples/hello_task.cpp
new file mode 100644
index 0000000..139406c
--- /dev/null
+++ b/examples/hello_task.cpp
@@ -0,0 +1,58 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Simple "hello world" example that uses marl::Event.
+
+#include "marl/defer.h"
+#include "marl/event.h"
+#include "marl/scheduler.h"
+
+#include <cstdio>
+
+int main() {
+  // Create a marl scheduler using 4 hardware threads.
+  // Bind this scheduler to the main thread so we can call marl::schedule()
+  marl::Scheduler scheduler;
+  scheduler.bind();
+  scheduler.setWorkerThreadCount(4);
+  defer(scheduler.unbind());  // Automatically unbind before returning.
+
+  // Create an event that automatically resets itself.
+  marl::Event sayHellow(marl::Event::Mode::Auto);
+  marl::Event saidHellow(marl::Event::Mode::Auto);
+
+  // Schedule some tasks to run asynchronously.
+  for (int i = 0; i < 10; i++) {
+    // Each task will run on one of the 4 worker threads.
+    marl::schedule([=] {  // All marl primitives are capture-by-value.
+      printf("Task %d waiting to say hello!\n", i);
+
+      // Blocking in a task?
+      // The scheduler will find something else for this thread to do.
+      sayHellow.wait();
+
+      printf("Hello from task %d!\n", i);
+
+      saidHellow.signal();
+    });
+  }
+
+  // Unblock the tasks one by one.
+  for (int i = 0; i < 10; i++) {
+    sayHellow.signal();
+    saidHellow.wait();
+  }
+
+  // All tasks are guaranteed to complete before the scheduler is destructed.
+}
diff --git a/examples/primes.cpp b/examples/primes.cpp
index e03dd47..baa8998 100644
--- a/examples/primes.cpp
+++ b/examples/primes.cpp
@@ -39,10 +39,7 @@
   return true;
 }
 
-int main(int argc, const char** argv) {
-  (void)argc;  // unused parameter
-  (void)argv;  // unused parameter
-
+int main() {
   // Create a marl scheduler using the full number of logical cpus.
   // Bind this scheduler to the main thread so we can call marl::schedule()
   marl::Scheduler scheduler;
diff --git a/include/marl/conditionvariable.h b/include/marl/conditionvariable.h
index c5b6787..daa7c59 100644
--- a/include/marl/conditionvariable.h
+++ b/include/marl/conditionvariable.h
@@ -76,13 +76,12 @@
   if (numWaiting == 0) {
     return;
   }
-  std::unique_lock<std::mutex> lock(mutex);
-  for (auto fiber : waiting) {
-    fiber->schedule();
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    for (auto fiber : waiting) {
+      fiber->notify();
+    }
   }
-  waiting.clear();
-  lock.unlock();
-
   if (numWaitingOnCondition > 0) {
     condition.notify_one();
   }
@@ -92,13 +91,12 @@
   if (numWaiting == 0) {
     return;
   }
-  std::unique_lock<std::mutex> lock(mutex);
-  for (auto fiber : waiting) {
-    fiber->schedule();
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    for (auto fiber : waiting) {
+      fiber->notify();
+    }
   }
-  waiting.clear();
-  lock.unlock();
-
   if (numWaitingOnCondition > 0) {
     condition.notify_all();
   }
@@ -114,15 +112,15 @@
   if (auto fiber = Scheduler::Fiber::current()) {
     // Currently executing on a scheduler fiber.
     // Yield to let other tasks run that can unblock this fiber.
-    while (!pred()) {
-      mutex.lock();
-      waiting.emplace(fiber);
-      mutex.unlock();
+    mutex.lock();
+    waiting.emplace(fiber);
+    mutex.unlock();
 
-      lock.unlock();
-      fiber->yield();
-      lock.lock();
-    }
+    fiber->wait(lock, pred);
+
+    mutex.lock();
+    waiting.erase(fiber);
+    mutex.unlock();
   } else {
     // Currently running outside of the scheduler.
     // Delegate to the std::condition_variable.
@@ -155,23 +153,17 @@
   if (auto fiber = Scheduler::Fiber::current()) {
     // Currently executing on a scheduler fiber.
     // Yield to let other tasks run that can unblock this fiber.
-    while (!pred()) {
-      mutex.lock();
-      waiting.emplace(fiber);
-      mutex.unlock();
+    mutex.lock();
+    waiting.emplace(fiber);
+    mutex.unlock();
 
-      lock.unlock();
-      fiber->yield_until(timeout);
-      lock.lock();
+    auto res = fiber->wait(lock, timeout, pred);
 
-      if (std::chrono::system_clock::now() >= timeout) {
-        mutex.lock();
-        waiting.erase(fiber);
-        mutex.unlock();
-        return false;
-      }
-    }
-    return true;
+    mutex.lock();
+    waiting.erase(fiber);
+    mutex.unlock();
+
+    return res;
   } else {
     // Currently running outside of the scheduler.
     // Delegate to the std::condition_variable.
diff --git a/include/marl/sal.h b/include/marl/sal.h
index c942eee..de47a49 100644
--- a/include/marl/sal.h
+++ b/include/marl/sal.h
@@ -19,12 +19,24 @@
 #ifndef marl_sal_h
 #define marl_sal_h
 
-#ifndef _Requires_lock_held_
-#define _Requires_lock_held_(x)
+#ifndef _Acquires_lock_
+#define _Acquires_lock_(...)
 #endif
 
-#ifndef _Requires_lock_not_held_
-#define _Requires_lock_not_held_(x)
+#ifndef _Guarded_by_
+#define _Guarded_by_(...)
+#endif
+
+#ifndef _Releases_lock_
+#define _Releases_lock_(...)
+#endif
+
+#ifndef _Requires_lock_held_
+#define _Requires_lock_held_(...)
+#endif
+
+#ifndef _When_
+#define _When_(...)
 #endif
 
 #endif  // marl_sal_h
diff --git a/include/marl/scheduler.h b/include/marl/scheduler.h
index ffcddc3..8f5532d 100644
--- a/include/marl/scheduler.h
+++ b/include/marl/scheduler.h
@@ -50,6 +50,7 @@
 
  public:
   using TimePoint = std::chrono::system_clock::time_point;
+  using Predicate = std::function<bool()>;
 
   Scheduler(Allocator* allocator = Allocator::Default);
   ~Scheduler();
@@ -98,25 +99,49 @@
   // thread that previously executed it.
   class Fiber {
    public:
+    using Lock = std::unique_lock<std::mutex>;
+
     // current() returns the currently executing fiber, or nullptr if called
     // without a bound scheduler.
     static Fiber* current();
 
-    // yield() suspends execution of this Fiber, allowing the thread to work
-    // on other tasks.
-    // yield() must only be called on the currently executing fiber.
-    void yield();
+    // wait() suspends execution of this Fiber until the Fiber is woken up with
+    // a call to notify() and the predicate pred returns true.
+    // If the predicate pred does not return true when notify() is called, then
+    // the Fiber is automatically re-suspended, and will need to be woken with
+    // another call to notify().
+    // While the Fiber is suspended, the scheduler thread may continue executing
+    // other tasks.
+    // lock must be locked before calling, and is unlocked by wait() just before
+    // the Fiber is suspended, and re-locked before the fiber is resumed. lock
+    // will be locked before wait() returns.
+    // pred will always be called with the lock held.
+    // wait() must only be called on the currently executing fiber.
+    void wait(Lock& lock, const Predicate& pred);
 
-    // yield_until() suspends execution of this Fiber, allowing the thread to
-    // work on other tasks. yield_until() may automatically resume sometime
-    // after timeout.
-    // yield_until() must only be called on the currently executing fiber.
+    // wait() suspends execution of this Fiber until the Fiber is woken up with
+    // a call to notify() and the predicate pred returns true, or sometime after
+    // the timeout is reached.
+    // If the predicate pred does not return true when notify() is called, then
+    // the Fiber is automatically re-suspended, and will need to be woken with
+    // another call to notify() or will be woken sometime after the timeout is
+    // reached.
+    // While the Fiber is suspended, the scheduler thread may continue executing
+    // other tasks.
+    // lock must be locked before calling, and is unlocked by wait() just before
+    // the Fiber is suspended, and re-locked before the fiber is resumed. lock
+    // will be locked before wait() returns.
+    // pred will always be called with the lock held.
+    // wait() must only be called on the currently executing fiber.
     template <typename Clock, typename Duration>
-    inline void yield_until(
-        const std::chrono::time_point<Clock, Duration>& timeout);
+    inline bool wait(Lock& lock,
+                     const std::chrono::time_point<Clock, Duration>& timeout,
+                     const Predicate& pred);
 
-    // schedule() reschedules the suspended Fiber for execution.
-    void schedule();
+    // notify() reschedules the suspended Fiber for execution.
+    // notify() is usually only called when the predicate for one or more wait()
+    // calls will likely return true.
+    void notify();
 
     // id is the thread-unique identifier of the Fiber.
     uint32_t const id;
@@ -125,9 +150,28 @@
     friend class Allocator;
     friend class Scheduler;
 
-    Fiber(Allocator::unique_ptr<OSFiber>&&, uint32_t id);
+    enum class State {
+      // Idle: the Fiber is currently unused, and sits in Worker::idleFibers,
+      // ready to be recycled.
+      Idle,
 
-    void yield_until_sc(const TimePoint& timeout);
+      // Yielded: the Fiber is currently blocked on a wait() call with no
+      // timeout.
+      Yielded,
+
+      // Waiting: the Fiber is currently blocked on a wait() call with a
+      // timeout. The fiber is sitting in the Worker::Work::waiting queue.
+      Waiting,
+
+      // Queued: the Fiber is currently queued for execution in the
+      // Worker::Work::fibers queue.
+      Queued,
+
+      // Running: the Fiber is currently executing.
+      Running,
+    };
+
+    Fiber(Allocator::unique_ptr<OSFiber>&&, uint32_t id);
 
     // switchTo() switches execution to the given fiber.
     // switchTo() must only be called on the currently executing fiber.
@@ -147,8 +191,13 @@
         Allocator* allocator,
         uint32_t id);
 
+    // toString() returns a string representation of the given State.
+    // Used for debugging.
+    static const char* toString(State state);
+
     Allocator::unique_ptr<OSFiber> const impl;
     Worker* const worker;
+    State state = State::Running;  // Guarded by Worker's work.mutex.
   };
 
  private:
@@ -178,6 +227,9 @@
     // erase() removes the fiber from the waiting list.
     inline void erase(Fiber* fiber);
 
+    // contains() returns true if fiber is waiting.
+    inline bool contains(Fiber* fiber) const;
+
    private:
     struct Timeout {
       TimePoint timepoint;
@@ -216,11 +268,18 @@
     // tasks have fully finished.
     void stop();
 
-    // yield() suspends execution of the current task, and looks for other
-    // tasks to start or continue execution.
-    // If timeout is not nullptr, yield may automatically resume the current
-    // task sometime after timeout.
-    void yield(Fiber* fiber, const TimePoint* timeout);
+    // wait() suspends execution of the current task until the predicate pred
+    // returns true.
+    // See Fiber::wait() for more information.
+    bool wait(Fiber::Lock& lock,
+              const TimePoint* timeout,
+              const Predicate& pred);
+
+    // suspend() suspends the currently executing Fiber until the fiber is
+    // woken with a call to enqueue(Fiber*), or automatically sometime after the
+    // optional timeout.
+    _Requires_lock_held_(work.mutex)
+    void suspend(const TimePoint* timeout);
 
     // enqueue(Fiber*) enqueues resuming of a suspended fiber.
     void enqueue(Fiber* fiber);
@@ -231,10 +290,13 @@
     // tryLock() attempts to lock the worker for task enqueing.
     // If the lock was successful then true is returned, and the caller must
     // call enqueueAndUnlock().
+    _When_(return == true, _Acquires_lock_(work.mutex))
     bool tryLock();
 
     // enqueueAndUnlock() enqueues the task and unlocks the worker.
     // Must only be called after a call to tryLock() which returned true.
+    _Requires_lock_held_(work.mutex)
+    _Releases_lock_(work.mutex)
     void enqueueAndUnlock(Task&& task);
 
     // flush() processes all pending tasks before returning.
@@ -271,13 +333,13 @@
     void switchToFiber(Fiber*);
 
     // runUntilIdle() executes all pending tasks and then returns.
-    _Requires_lock_held_(lock) void runUntilIdle(
-        std::unique_lock<std::mutex>& lock);
+    _Requires_lock_held_(work.mutex)
+    void runUntilIdle();
 
     // waitForWork() blocks until new work is available, potentially calling
     // spinForWork().
-    _Requires_lock_held_(lock) void waitForWork(
-        std::unique_lock<std::mutex>& lock);
+    _Requires_lock_held_(work.mutex)
+    void waitForWork();
 
     // spinForWork() attempts to steal work from another Worker, and keeps
     // the thread awake for a short duration. This reduces overheads of
@@ -286,20 +348,29 @@
 
     // enqueueFiberTimeouts() enqueues all the fibers that have finished
     // waiting.
-    _Requires_lock_held_(lock) void enqueueFiberTimeouts();
+    _Requires_lock_held_(work.mutex)
+    void enqueueFiberTimeouts();
+
+    _Requires_lock_held_(work.mutex)
+    inline void changeFiberState(Fiber* fiber,
+                                 Fiber::State from,
+                                 Fiber::State to) const;
+
+    _Requires_lock_held_(work.mutex)
+    inline void setFiberState(Fiber* fiber, Fiber::State to) const;
 
     // numBlockedFibers() returns the number of fibers currently blocked and
     // held externally.
-    _Requires_lock_held_(lock) inline size_t numBlockedFibers() const {
+    inline size_t numBlockedFibers() const {
       return workerFibers.size() - idleFibers.size();
     }
 
     // Work holds tasks and fibers that are enqueued on the Worker.
     struct Work {
       std::atomic<uint64_t> num = {0};  // tasks.size() + fibers.size()
-      TaskQueue tasks;                  // guarded by mutex
-      FiberQueue fibers;                // guarded by mutex
-      WaitingFibers waiting;            // guarded by mutex
+      _Guarded_by_(mutex) TaskQueue tasks;
+      _Guarded_by_(mutex) FiberQueue fibers;
+      _Guarded_by_(mutex) WaitingFibers waiting;
       std::condition_variable added;
       std::mutex mutex;
     };
@@ -366,11 +437,14 @@
 };
 
 template <typename Clock, typename Duration>
-void Scheduler::Fiber::yield_until(
-    const std::chrono::time_point<Clock, Duration>& timeout) {
+bool Scheduler::Fiber::wait(
+    Lock& lock,
+    const std::chrono::time_point<Clock, Duration>& timeout,
+    const Predicate& pred) {
   using ToDuration = typename TimePoint::duration;
   using ToClock = typename TimePoint::clock;
-  yield_until_sc(std::chrono::time_point_cast<ToDuration, ToClock>(timeout));
+  auto tp = std::chrono::time_point_cast<ToDuration, ToClock>(timeout);
+  return worker->wait(lock, &tp, pred);
 }
 
 Scheduler::Worker* Scheduler::Worker::getCurrent() {
diff --git a/kokoro/macos/presubmit.sh b/kokoro/macos/presubmit.sh
index ea97913..da16d77 100755
--- a/kokoro/macos/presubmit.sh
+++ b/kokoro/macos/presubmit.sh
@@ -27,7 +27,7 @@
     sh bazel-0.29.1-installer-darwin-x86_64.sh --prefix=$BUILD_ROOT/bazel
     rm bazel-0.29.1-installer-darwin-x86_64.sh
     # Build and run
-    $BUILD_ROOT/bazel/bin/bazel test //:tests
+    $BUILD_ROOT/bazel/bin/bazel test //:tests --test_output=all
     $BUILD_ROOT/bazel/bin/bazel run //examples:fractal
     $BUILD_ROOT/bazel/bin/bazel run //examples:primes > /dev/null
 else
diff --git a/kokoro/ubuntu/presubmit.sh b/kokoro/ubuntu/presubmit.sh
index 5ab2948..9e19fb0 100755
--- a/kokoro/ubuntu/presubmit.sh
+++ b/kokoro/ubuntu/presubmit.sh
@@ -38,7 +38,7 @@
     bash bazel-0.29.1-installer-linux-x86_64.sh --prefix=$BUILD_ROOT/bazel
     rm bazel-0.29.1-installer-linux-x86_64.sh
     # Build and run
-    $BUILD_ROOT/bazel/bin/bazel test //:tests
+    $BUILD_ROOT/bazel/bin/bazel test //:tests --test_output=all
     $BUILD_ROOT/bazel/bin/bazel run //examples:fractal
     $BUILD_ROOT/bazel/bin/bazel run //examples:primes > /dev/null
 else
diff --git a/src/blockingcall_bench.cpp b/src/blockingcall_bench.cpp
new file mode 100644
index 0000000..0640816
--- /dev/null
+++ b/src/blockingcall_bench.cpp
@@ -0,0 +1,24 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/blockingcall.h"
+
+#include "benchmark/benchmark.h"
+
+static void BlockingCall(benchmark::State& state) {
+  for (auto _ : state) {
+    marl::blocking_call([] {});
+  }
+}
+BENCHMARK(BlockingCall);
diff --git a/src/defer_bench.cpp b/src/defer_bench.cpp
new file mode 100644
index 0000000..c89dc32
--- /dev/null
+++ b/src/defer_bench.cpp
@@ -0,0 +1,25 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/defer.h"
+
+#include "benchmark/benchmark.h"
+
+static void Defer(benchmark::State& state) {
+  int i = 0;
+  for (auto _ : state) {
+    defer(benchmark::DoNotOptimize(i++));
+  }
+}
+BENCHMARK(Defer);
diff --git a/src/event_bench.cpp b/src/event_bench.cpp
new file mode 100644
index 0000000..985c718
--- /dev/null
+++ b/src/event_bench.cpp
@@ -0,0 +1,38 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/event.h"
+
+#include "benchmark/benchmark.h"
+
+#include <vector>
+
+BENCHMARK_DEFINE_F(Schedule, Event)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      std::vector<marl::Event> events(numTasks + 1);
+      for (auto i = 0; i < numTasks; i++) {
+        marl::schedule([=] {
+          events[i].wait();
+          events[i + 1].signal();
+        });
+      }
+      events.front().signal();
+      events.back().wait();
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, Event)->Apply(Schedule::args<512>);
diff --git a/src/event_test.cpp b/src/event_test.cpp
index aa4fdbe..d00e721 100644
--- a/src/event_test.cpp
+++ b/src/event_test.cpp
@@ -191,19 +191,20 @@
   wg.wait();
 }
 
-// EventWaitStressTest spins up a whole lot of wait_fors(), unblocks them early,
-// and then let's all the workers go to idle before repeating.
+// EventWaitStressTest spins up a whole lot of wait_fors(), unblocking some
+// with timeouts and some with an event signal, and then lets all the workers
+// go to idle before repeating.
 // This is testing to ensure that the scheduler handles timeouts correctly when
 // they are early-unblocked. Specifically, this is to test that fibers are
 // not double-placed into the idle or working lists.
 TEST_P(WithBoundScheduler, EventWaitStressTest) {
   auto event = marl::Event(marl::Event::Mode::Manual);
   for (int i = 0; i < 10; i++) {
-    auto wg = marl::WaitGroup(1000);
-    for (int j = 0; j < 1000; j++) {
+    auto wg = marl::WaitGroup(100);
+    for (int j = 0; j < 100; j++) {
       marl::schedule([=] {
         defer(wg.done());
-        event.wait_for(std::chrono::milliseconds(100));
+        event.wait_for(std::chrono::milliseconds(j));
       });
     }
     std::this_thread::sleep_for(std::chrono::milliseconds(50));
diff --git a/src/marl_bench.cpp b/src/marl_bench.cpp
new file mode 100644
index 0000000..f258daf
--- /dev/null
+++ b/src/marl_bench.cpp
@@ -0,0 +1,27 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+BENCHMARK_MAIN();
+
+uint32_t Schedule::doSomeWork(uint32_t x) {
+  uint32_t seed = x;  // Original input; masks the result of every round.
+  for (uint32_t round = 0; round < 100000; ++round) {
+    x |= x << 4;
+    x |= 0x1020;
+    x = seed & (x >> 2);
+  }
+  return x;  // Depends only on the input value.
+}
\ No newline at end of file
diff --git a/src/marl_bench.h b/src/marl_bench.h
new file mode 100644
index 0000000..4932b40
--- /dev/null
+++ b/src/marl_bench.h
@@ -0,0 +1,65 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/scheduler.h"
+#include "marl/thread.h"
+
+#include "benchmark/benchmark.h"
+
+// Schedule is the fixture shared by the marl scheduling benchmarks.
+class Schedule : public benchmark::Fixture {
+ public:
+  // Nothing to set up or tear down per benchmark: run() owns the scheduler.
+  void SetUp(const ::benchmark::State&) {}
+  void TearDown(const ::benchmark::State&) {}
+
+  // run() creates a scheduler, sets the number of worker threads from the
+  // benchmark arguments, calls f, then unbinds and destructs the scheduler.
+  // F must be a function of the signature: void(int numTasks)
+  template <typename F>
+  void run(const ::benchmark::State& state, F&& f) {
+    marl::Scheduler scheduler;
+    scheduler.setWorkerThreadCount(numThreads(state));
+    scheduler.bind();
+    f(numTasks(state));
+    scheduler.unbind();
+  }
+
+  // args() sets up the benchmark to run from [1 .. NumTasks] tasks (in 8^n
+  // steps) across 0 worker threads to numLogicalCPUs.
+  template <int NumTasks = 0x40000>
+  static void args(benchmark::internal::Benchmark* b) {
+    b->ArgNames({"tasks", "threads"});
+    for (int tasks = 1; tasks <= NumTasks; tasks *= 8) {  // int, as NumTasks.
+      for (unsigned int threads = 0U; threads <= marl::Thread::numLogicalCPUs();
+           ++threads) {
+        b->Args({tasks, threads});
+      }
+    }
+  }
+
+  // numThreads() returns the worker thread count (benchmark argument 1).
+  static int numThreads(const ::benchmark::State& state) {
+    return static_cast<int>(state.range(1));
+  }
+
+  // numTasks() returns the number of tasks (benchmark argument 0).
+  static int numTasks(const ::benchmark::State& state) {
+    return static_cast<int>(state.range(0));
+  }
+
+  // doSomeWork() performs some made up bit-shifty algorithm that's difficult
+  // for a compiler to optimize and produces consistent results.
+  static uint32_t doSomeWork(uint32_t x);
+};
\ No newline at end of file
diff --git a/src/non_marl_bench.cpp b/src/non_marl_bench.cpp
new file mode 100644
index 0000000..ab1ea9a
--- /dev/null
+++ b/src/non_marl_bench.cpp
@@ -0,0 +1,167 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains a number of benchmarks that do not use marl.
+// They exist to compare marl's performance against other simple scheduler
+// approaches.
+
+#include "marl_bench.h"
+
+#include "benchmark/benchmark.h"
+
+#include <mutex>
+#include <queue>
+#include <thread>
+
+namespace {
+
+// Event is a one-shot wait-and-signal primitive built on the standard library.
+class Event {
+ public:
+  // wait blocks the calling thread until signal() has been called.
+  void wait() {
+    std::unique_lock<std::mutex> guard(mutex_);
+    while (!signalled_) cv_.wait(guard);
+  }
+
+  // signal marks the Event as signalled and wakes every thread in wait().
+  void signal() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    signalled_ = true;
+    cv_.notify_all();
+  }
+
+ private:
+  std::condition_variable cv_;
+  std::mutex mutex_;
+  bool signalled_ = false;  // Set by signal(); nothing in this class resets it.
+};
+
+}  // anonymous namespace
+
+// A simple multi-thread, single-queue task executor that shares a single mutex
+// across N threads. This implementation suffers from lock contention.
+static void SingleQueueTaskExecutor(benchmark::State& state) {
+  using Task = std::function<uint32_t(uint32_t)>;
+
+  auto const numTasks = Schedule::numTasks(state);      // Benchmark arg 0.
+  auto const numThreads = Schedule::numThreads(state);  // Benchmark arg 1.
+
+  for (auto _ : state) {
+    state.PauseTiming();  // Exclude queue and thread set up from the timing.
+
+    std::mutex mutex;
+    // Set everything up with the mutex locked to prevent the threads from
+    // performing work while the timing is paused.
+    mutex.lock();
+
+    // Set up the tasks.
+    std::queue<Task> tasks;
+    for (int i = 0; i < numTasks; i++) {
+      tasks.push(Schedule::doSomeWork);
+    }
+
+    auto taskRunner = [&] {  // Drains the shared queue, one task per lock.
+      while (true) {
+        Task task;
+
+        // Take the next task.
+        // Note that this lock is likely to block while waiting for other
+        // threads.
+        mutex.lock();
+        if (tasks.size() > 0) {
+          task = tasks.front();
+          tasks.pop();
+        }
+        mutex.unlock();
+
+        if (task) {  // Still empty when the queue had nothing left to take.
+          task(123);
+        } else {
+          return;  // done.
+        }
+      }
+    };
+
+    // Set up the threads.
+    std::vector<std::thread> threads;
+    for (int i = 0; i < numThreads; i++) {
+      threads.emplace_back(std::thread(taskRunner));  // Blocks on the mutex.
+    }
+
+    state.ResumeTiming();
+    mutex.unlock();  // Go threads, go!
+
+    if (numThreads > 0) {
+      // Wait for all threads to finish.
+      for (auto& thread : threads) {
+        thread.join();
+      }
+    } else {
+      // Single-threaded test - just run the worker.
+      taskRunner();
+    }
+  }
+}
+BENCHMARK(SingleQueueTaskExecutor)->Apply(Schedule::args);
+
+// A simple multi-thread, multi-queue task executor that avoids lock contention.
+// Task queues are filled round-robin with the same task, so each queue should
+// take an equal amount of time to execute.
+static void MultiQueueTaskExecutor(benchmark::State& state) {
+  using Task = std::function<uint32_t(uint32_t)>;
+  using TaskQueue = std::vector<Task>;
+
+  auto const numTasks = Schedule::numTasks(state);
+  auto const numThreads = Schedule::numThreads(state);
+  auto const numQueues = std::max(numThreads, 1);  // One queue if no threads.
+
+  // Set up the task queues once; every iteration re-runs the same queues.
+  std::vector<TaskQueue> taskQueues(numQueues);
+  for (int i = 0; i < numTasks; i++) {
+    taskQueues[i % numQueues].emplace_back(Schedule::doSomeWork);
+  }
+
+  for (auto _ : state) {
+    if (numThreads > 0) {
+      state.PauseTiming();  // Exclude thread creation from the timing.
+      Event start;
+
+      // Set up the threads. Thread i only ever reads taskQueues[i].
+      std::vector<std::thread> threads;
+      for (int i = 0; i < numThreads; i++) {
+        threads.emplace_back(std::thread([&, i] {
+          start.wait();  // Hold until timing has been resumed below.
+          for (auto& task : taskQueues[i]) {
+            task(123);
+          }
+        }));
+      }
+
+      state.ResumeTiming();
+      start.signal();
+
+      // Wait for all threads to finish.
+      for (auto& thread : threads) {
+        thread.join();
+      }
+    } else {
+      // Single-threaded test - just run the tasks.
+      for (auto& task : taskQueues[0]) {
+        task(123);
+      }
+    }
+  }
+}
+BENCHMARK(MultiQueueTaskExecutor)->Apply(Schedule::args);
\ No newline at end of file
diff --git a/src/osfiber_asm.h b/src/osfiber_asm.h
index dc9d0af..c515ca6 100644
--- a/src/osfiber_asm.h
+++ b/src/osfiber_asm.h
@@ -32,6 +32,8 @@
 #include "osfiber_asm_arm.h"
 #elif defined(__powerpc64__)
 #include "osfiber_asm_ppc64.h"
+#elif defined(__mips__) && _MIPS_SIM == _ABI64
+#include "osfiber_asm_mips64.h"
 #else
 #error "Unsupported target"
 #endif
diff --git a/src/osfiber_asm_mips64.S b/src/osfiber_asm_mips64.S
new file mode 100644
index 0000000..005225e
--- /dev/null
+++ b/src/osfiber_asm_mips64.S
@@ -0,0 +1,86 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__mips__) && _MIPS_SIM == _ABI64
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_mips64.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+// a0: from - context that receives the current register state
+// a1: to   - context to resume (copied to v0, as a1 itself is reloaded below)
+.text
+.global MARL_ASM_SYMBOL(marl_fiber_swap)
+.align 4
+MARL_ASM_SYMBOL(marl_fiber_swap):
+
+    // Save context 'from'
+
+    // Store callee-preserved registers
+    sd  $s0, MARL_REG_s0($a0)
+    sd  $s1, MARL_REG_s1($a0)
+    sd  $s2, MARL_REG_s2($a0)
+    sd  $s3, MARL_REG_s3($a0)
+    sd  $s4, MARL_REG_s4($a0)
+    sd  $s5, MARL_REG_s5($a0)
+    sd  $s6, MARL_REG_s6($a0)
+    sd  $s7, MARL_REG_s7($a0)
+
+    s.d  $f24, MARL_REG_f24($a0)
+    s.d  $f25, MARL_REG_f25($a0)
+    s.d  $f26, MARL_REG_f26($a0)
+    s.d  $f27, MARL_REG_f27($a0)
+    s.d  $f28, MARL_REG_f28($a0)
+    s.d  $f29, MARL_REG_f29($a0)
+    s.d  $f30, MARL_REG_f30($a0)
+    s.d  $f31, MARL_REG_f31($a0)
+
+    sd  $gp, MARL_REG_gp($a0)
+    sd  $sp, MARL_REG_sp($a0)
+    sd  $fp, MARL_REG_fp($a0)
+    sd  $ra, MARL_REG_ra($a0)
+
+    move  $v0, $a1 // No return value, so v0 is free to hold 'to'
+
+    // Restore context 'to': recover callee-preserved registers
+    ld  $s0, MARL_REG_s0($v0)
+    ld  $s1, MARL_REG_s1($v0)
+    ld  $s2, MARL_REG_s2($v0)
+    ld  $s3, MARL_REG_s3($v0)
+    ld  $s4, MARL_REG_s4($v0)
+    ld  $s5, MARL_REG_s5($v0)
+    ld  $s6, MARL_REG_s6($v0)
+    ld  $s7, MARL_REG_s7($v0)
+
+    l.d  $f24, MARL_REG_f24($v0)
+    l.d  $f25, MARL_REG_f25($v0)
+    l.d  $f26, MARL_REG_f26($v0)
+    l.d  $f27, MARL_REG_f27($v0)
+    l.d  $f28, MARL_REG_f28($v0)
+    l.d  $f29, MARL_REG_f29($v0)
+    l.d  $f30, MARL_REG_f30($v0)
+    l.d  $f31, MARL_REG_f31($v0)
+
+    ld  $gp, MARL_REG_gp($v0)
+    ld  $sp, MARL_REG_sp($v0)
+    ld  $fp, MARL_REG_fp($v0)
+    ld  $ra, MARL_REG_ra($v0)
+
+    // Recover arguments (for a new fiber: the trampoline's target and arg)
+    ld  $a0, MARL_REG_a0($v0)
+    ld  $a1, MARL_REG_a1($v0)
+
+    jr	$ra
+
+#endif // defined(__mips__) && _MIPS_SIM == _ABI64
diff --git a/src/osfiber_asm_mips64.h b/src/osfiber_asm_mips64.h
new file mode 100644
index 0000000..e444e1c
--- /dev/null
+++ b/src/osfiber_asm_mips64.h
@@ -0,0 +1,126 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define MARL_REG_a0 0x00
+#define MARL_REG_a1 0x08
+#define MARL_REG_s0 0x10
+#define MARL_REG_s1 0x18
+#define MARL_REG_s2 0x20
+#define MARL_REG_s3 0x28
+#define MARL_REG_s4 0x30
+#define MARL_REG_s5 0x38
+#define MARL_REG_s6 0x40
+#define MARL_REG_s7 0x48
+#define MARL_REG_f24 0x50
+#define MARL_REG_f25 0x58
+#define MARL_REG_f26 0x60
+#define MARL_REG_f27 0x68
+#define MARL_REG_f28 0x70
+#define MARL_REG_f29 0x78
+#define MARL_REG_f30 0x80
+#define MARL_REG_f31 0x88
+#define MARL_REG_gp 0x90
+#define MARL_REG_sp 0x98
+#define MARL_REG_fp 0xa0
+#define MARL_REG_ra 0xa8
+
+#if defined(__APPLE__)
+#define MARL_ASM_SYMBOL(x) _##x
+#else
+#define MARL_ASM_SYMBOL(x) x
+#endif
+
+#ifndef MARL_BUILD_ASM
+
+#include <stdint.h>
+
+struct marl_fiber_context {  // Field offsets must match the MARL_REG_* values.
+  // First two argument registers (loaded, never stored, by marl_fiber_swap).
+  uintptr_t a0;
+  uintptr_t a1;
+
+  // callee-saved general purpose registers
+  uintptr_t s0;
+  uintptr_t s1;
+  uintptr_t s2;
+  uintptr_t s3;
+  uintptr_t s4;
+  uintptr_t s5;
+  uintptr_t s6;
+  uintptr_t s7;
+
+  // Floating point register slots, accessed with s.d / l.d in marl_fiber_swap.
+  uintptr_t f24;
+  uintptr_t f25;
+  uintptr_t f26;
+  uintptr_t f27;
+  uintptr_t f28;
+  uintptr_t f29;
+  uintptr_t f30;
+  uintptr_t f31;
+
+  uintptr_t gp;
+  uintptr_t sp;  // Set to the 16-byte aligned stack top for a new fiber.
+  uintptr_t fp;
+  uintptr_t ra;  // Address marl_fiber_swap jumps to once the context is loaded.
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, a0) == MARL_REG_a0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, a1) == MARL_REG_a1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s0) == MARL_REG_s0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s1) == MARL_REG_s1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s2) == MARL_REG_s2,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s3) == MARL_REG_s3,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s4) == MARL_REG_s4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s5) == MARL_REG_s5,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s6) == MARL_REG_s6,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s7) == MARL_REG_s7,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f24) == MARL_REG_f24,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f25) == MARL_REG_f25,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f26) == MARL_REG_f26,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f27) == MARL_REG_f27,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f28) == MARL_REG_f28,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f29) == MARL_REG_f29,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f30) == MARL_REG_f30,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f31) == MARL_REG_f31,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, gp) == MARL_REG_gp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, sp) == MARL_REG_sp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fp) == MARL_REG_fp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, ra) == MARL_REG_ra,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
diff --git a/src/osfiber_mips64.c b/src/osfiber_mips64.c
new file mode 100644
index 0000000..baf7f7b
--- /dev/null
+++ b/src/osfiber_mips64.c
@@ -0,0 +1,35 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__mips__) && _MIPS_SIM == _ABI64
+
+#include "osfiber_asm_mips64.h"
+
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  target(arg);  // Address of this function is stored in ctx->ra by set_target.
+}
+
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  const uintptr_t stack_end = (uintptr_t)((uint8_t*)stack + stack_size);
+  ctx->sp = stack_end & ~(uintptr_t)0xf;  // Round down to 16 bytes.
+  ctx->a0 = (uintptr_t)target;            // Trampoline's first argument.
+  ctx->a1 = (uintptr_t)arg;               // Trampoline's second argument.
+  ctx->ra = (uintptr_t)&marl_fiber_trampoline;
+}
+
+#endif // defined(__mips__) && _MIPS_SIM == _ABI64
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index d352bef..cae3a6b 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -17,7 +17,6 @@
 #include "marl/scheduler.h"
 
 #include "marl/debug.h"
-#include "marl/defer.h"
 #include "marl/thread.h"
 #include "marl/trace.h"
 
@@ -28,14 +27,38 @@
 // Enable to trace scheduler events.
 #define ENABLE_TRACE_EVENTS 0
 
+// Enable to print verbose debug logging.
+#define ENABLE_DEBUG_LOGGING 0
+
 #if ENABLE_TRACE_EVENTS
 #define TRACE(...) MARL_SCOPED_EVENT(__VA_ARGS__)
 #else
 #define TRACE(...)
 #endif
 
+#if ENABLE_DEBUG_LOGGING
+#define DBG_LOG(msg, ...) \
+  printf("%.3x " msg "\n", (int)threadID() & 0xfff, __VA_ARGS__)
+#else
+#define DBG_LOG(msg, ...)
+#endif
+
+#define ASSERT_FIBER_STATE(FIBER, STATE)                                   \
+  MARL_ASSERT(FIBER->state == STATE,                                       \
+              "fiber %d was in state %s, but expected %s", (int)FIBER->id, \
+              Fiber::toString(FIBER->state), Fiber::toString(STATE))
+
 namespace {
 
+#if ENABLE_DEBUG_LOGGING
+// threadID() hashes the calling thread's std::thread::id into a uint64_t.
+// It is only intended for labelling debug output.
+inline uint64_t threadID() {
+  std::hash<std::thread::id> hasher;
+  return hasher(std::this_thread::get_id());
+}
+#endif
+
 template <typename T>
 inline T take(std::queue<T>& queue) {
   auto out = std::move(queue.front());
@@ -78,7 +101,7 @@
   {
     std::unique_lock<std::mutex> lock(singleThreadedWorkerMutex);
     auto worker =
-        allocator->make_unique<Worker>(this, Worker::Mode::SingleThreaded, 0);
+        allocator->make_unique<Worker>(this, Worker::Mode::SingleThreaded, -1);
     worker->start();
     auto tid = std::this_thread::get_id();
     singleThreadedWorkers.emplace(tid, std::move(worker));
@@ -103,7 +126,7 @@
 }
 
 Scheduler::Scheduler(Allocator* allocator /* = Allocator::Default */)
-    : allocator(allocator) {
+    : allocator(allocator), workerThreads{} {
   for (size_t i = 0; i < spinningWorkers.size(); i++) {
     spinningWorkers[i] = -1;
   }
@@ -218,18 +241,12 @@
   return worker != nullptr ? worker->getCurrentFiber() : nullptr;
 }
 
-void Scheduler::Fiber::schedule() {
+void Scheduler::Fiber::notify() {
   worker->enqueue(this);
 }
 
-void Scheduler::Fiber::yield() {
-  MARL_SCOPED_EVENT("YIELD");
-  worker->yield(this, nullptr);
-}
-
-void Scheduler::Fiber::yield_until_sc(const TimePoint& timeout) {
-  MARL_SCOPED_EVENT("YIELD_UNTIL");
-  worker->yield(this, &timeout);
+void Scheduler::Fiber::wait(Lock& lock, const Predicate& pred) {
+  worker->wait(lock, nullptr, pred);
 }
 
 void Scheduler::Fiber::switchTo(Fiber* to) {
@@ -253,6 +270,23 @@
       OSFiber::createFiberFromCurrentThread(allocator), id);
 }
 
+// toString() returns the name of the given fiber state, for use in log and
+// assertion messages.
+const char* Scheduler::Fiber::toString(State state) {
+  // No default case, so the compiler can warn about unhandled enumerators.
+  switch (state) {
+    case State::Idle: return "Idle";
+    case State::Yielded: return "Yielded";
+    case State::Queued: return "Queued";
+    case State::Running: return "Running";
+    case State::Waiting: return "Waiting";
+  }
+
+  // Only reachable if state holds a value outside the enumerators above.
+  MARL_ASSERT(false, "bad fiber state");
+  return "<unknown>";
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Scheduler::WaitingFibers
 ////////////////////////////////////////////////////////////////////////////////
@@ -300,6 +334,10 @@
   }
 }
 
+bool Scheduler::WaitingFibers::contains(Fiber* fiber) const {
+  return fibers.find(fiber) != fibers.end();  // True if fiber is a key.
+}
+
 bool Scheduler::WaitingFibers::Timeout::operator<(const Timeout& o) const {
   if (timepoint != o.timepoint) {
     return timepoint < o.timepoint;
@@ -363,51 +401,107 @@
   }
 }
 
-void Scheduler::Worker::yield(
-    Fiber* from,
+bool Scheduler::Worker::wait(Fiber::Lock& waitLock,
+                             const TimePoint* timeout,
+                             const Predicate& pred) {
+  // Suspends the current fiber until pred() is true (returns true) or the
+  // optional timeout has passed (returns false). waitLock is held on return.
+  DBG_LOG("%d: WAIT(%d)", (int)id, (int)currentFiber->id);
+  while (!pred()) {
+    // Lock the work mutex to call suspend().
+    work.mutex.lock();
+
+    // Unlock the wait mutex with the work mutex lock held.
+    // Order is important here as we need to ensure that the fiber is not
+    // enqueued (via Fiber::notify()) between the waitLock.unlock() and fiber
+    // switch, otherwise the Fiber::notify() call may be ignored and the fiber
+    // is never woken.
+    waitLock.unlock();
+    suspend(timeout);
+
+    // Fiber resumed. We don't need the work mutex locked any more.
+    work.mutex.unlock();
+
+    // Re-lock before the timeout check so waitLock is held on both returns.
+    waitLock.lock();
+    if (timeout != nullptr && std::chrono::system_clock::now() >= *timeout) {
+      return false;  // Timed out.
+    }
+    // Otherwise: notified, or a spurious wake up. Re-test pred().
+  }
+  return true;
+}
+
+void Scheduler::Worker::suspend(
     const std::chrono::system_clock::time_point* timeout) {
-  MARL_ASSERT(currentFiber == from,
-              "Attempting to call yield from a non-current fiber");
-
   // Current fiber is yielding as it is blocked.
-
-  std::unique_lock<std::mutex> lock(work.mutex);
   if (timeout != nullptr) {
-    work.waiting.add(*timeout, from);
+    changeFiberState(currentFiber, Fiber::State::Running,
+                     Fiber::State::Waiting);
+    work.waiting.add(*timeout, currentFiber);
+  } else {
+    changeFiberState(currentFiber, Fiber::State::Running,
+                     Fiber::State::Yielded);
   }
 
   // First wait until there's something else this worker can do.
-  waitForWork(lock);
+  waitForWork();
 
   if (work.fibers.size() > 0) {
     // There's another fiber that has become unblocked, resume that.
     work.num--;
     auto to = take(work.fibers);
-    lock.unlock();
+    ASSERT_FIBER_STATE(to, Fiber::State::Queued);
+    work.mutex.unlock();
     switchToFiber(to);
+    work.mutex.lock();
   } else if (idleFibers.size() > 0) {
     // There's an old fiber we can reuse, resume that.
     auto to = take(idleFibers);
-    lock.unlock();
+    ASSERT_FIBER_STATE(to, Fiber::State::Idle);
+    work.mutex.unlock();
     switchToFiber(to);
+    work.mutex.lock();
   } else {
-    // Tasks to process and no existing fibers to resume. Spawn a new fiber.
-    lock.unlock();
+    // Tasks to process and no existing fibers to resume.
+    // Spawn a new fiber.
+    work.mutex.unlock();
     switchToFiber(createWorkerFiber());
+    work.mutex.lock();
   }
+
+  setFiberState(currentFiber, Fiber::State::Running);
 }
 
+_When_(return == true, _Acquires_lock_(work.mutex))
 bool Scheduler::Worker::tryLock() {
   return work.mutex.try_lock();
 }
 
 void Scheduler::Worker::enqueue(Fiber* fiber) {
   std::unique_lock<std::mutex> lock(work.mutex);
-  auto wasIdle = work.num == 0;
-  work.waiting.erase(fiber);
+  DBG_LOG("%d: ENQUEUE(%d %s)", (int)id, (int)fiber->id,
+          Fiber::toString(fiber->state));
+  switch (fiber->state) {
+    case Fiber::State::Running:
+    case Fiber::State::Queued:
+      return;  // Nothing to do here - task is already queued or running.
+    case Fiber::State::Waiting:
+      work.waiting.erase(fiber);
+      break;
+    case Fiber::State::Idle:
+    case Fiber::State::Yielded:
+      break;
+  }
+  bool wasIdle = work.num == 0;
   work.fibers.push(std::move(fiber));
+  MARL_ASSERT(!work.waiting.contains(fiber),
+              "fiber is unexpectedly in the waiting list");
+  setFiberState(fiber, Fiber::State::Queued);
   work.num++;
   lock.unlock();
+
   if (wasIdle) {
     work.added.notify_one();
   }
@@ -418,6 +512,8 @@
   enqueueAndUnlock(std::move(task));
 }
 
+_Requires_lock_held_(work.mutex)
+_Releases_lock_(work.mutex)
 void Scheduler::Worker::enqueueAndUnlock(Task&& task) {
   auto wasIdle = work.num == 0;
   work.tasks.push(std::move(task));
@@ -435,12 +531,13 @@
   if (!work.mutex.try_lock()) {
     return false;
   }
-  defer(work.mutex.unlock());
   if (work.tasks.size() == 0) {
+    work.mutex.unlock();
     return false;
   }
   work.num--;
   out = take(work.tasks);
+  work.mutex.unlock();
   return true;
 }
 
@@ -448,7 +545,7 @@
   MARL_ASSERT(mode == Mode::SingleThreaded,
               "flush() can only be used on a single-threaded worker");
   std::unique_lock<std::mutex> lock(work.mutex);
-  runUntilIdle(lock);
+  runUntilIdle();
 }
 
 void Scheduler::Worker::run() {
@@ -461,8 +558,8 @@
         work.added.wait(
             lock, [this] { return work.num > 0 || work.waiting || shutdown; });
         while (!shutdown || work.num > 0 || numBlockedFibers() > 0U) {
-          waitForWork(lock);
-          runUntilIdle(lock);
+          waitForWork();
+          runUntilIdle();
         }
         Worker::current = nullptr;
       }
@@ -470,6 +567,7 @@
       break;
     }
     case Mode::SingleThreaded:
+      ASSERT_FIBER_STATE(currentFiber, Fiber::State::Running);
       while (!shutdown) {
         flush();
         idleFibers.emplace(currentFiber);
@@ -482,37 +580,62 @@
   }
 }
 
-_Requires_lock_held_(lock) void Scheduler::Worker::waitForWork(
-    std::unique_lock<std::mutex>& lock) {
+_Requires_lock_held_(work.mutex)
+void Scheduler::Worker::waitForWork() {
   MARL_ASSERT(work.num == work.fibers.size() + work.tasks.size(),
               "work.num out of sync");
   if (work.num == 0 && mode == Mode::MultiThreaded) {
     scheduler->onBeginSpinning(id);
-    lock.unlock();
+    work.mutex.unlock();
     spinForWork();
-    lock.lock();
+    work.mutex.lock();
   }
 
   if (work.waiting) {
+    std::unique_lock<std::mutex> lock(work.mutex, std::adopt_lock);
     work.added.wait_until(lock, work.waiting.next(), [this] {
       return work.num > 0 || (shutdown && numBlockedFibers() == 0U);
     });
+    lock.release();  // Keep the lock held.
     enqueueFiberTimeouts();
   } else {
+    std::unique_lock<std::mutex> lock(work.mutex, std::adopt_lock);
     work.added.wait(lock, [this] {
       return work.num > 0 || (shutdown && numBlockedFibers() == 0U);
     });
+    lock.release();  // Keep the lock held.
   }
 }
 
-_Requires_lock_held_(lock) void Scheduler::Worker::enqueueFiberTimeouts() {
+_Requires_lock_held_(work.mutex)
+void Scheduler::Worker::enqueueFiberTimeouts() {
   auto now = std::chrono::system_clock::now();
   while (auto fiber = work.waiting.take(now)) {
+    changeFiberState(fiber, Fiber::State::Waiting, Fiber::State::Queued);
+    DBG_LOG("%d: TIMEOUT(%d)", (int)id, (int)fiber->id);
     work.fibers.push(fiber);
     work.num++;
   }
 }
 
+_Requires_lock_held_(work.mutex)
+void Scheduler::Worker::changeFiberState(Fiber* fiber,
+                                         Fiber::State from,
+                                         Fiber::State to) const {
+  (void)from;  // Unused if DBG_LOG and the assert below both compile away.
+  DBG_LOG("%d: CHANGE_FIBER_STATE(%d %s -> %s)", (int)id, (int)fiber->id,
+          Fiber::toString(from), Fiber::toString(to));
+  ASSERT_FIBER_STATE(fiber, from);  // The fiber must currently be in `from`.
+  fiber->state = to;
+}
+
+_Requires_lock_held_(work.mutex)
+void Scheduler::Worker::setFiberState(Fiber* fiber, Fiber::State to) const {
+  DBG_LOG("%d: SET_FIBER_STATE(%d %s -> %s)", (int)id, (int)fiber->id,
+          Fiber::toString(fiber->state), Fiber::toString(to));
+  fiber->state = to;  // Unconditional: the previous state is not asserted.
+}
+
 void Scheduler::Worker::spinForWork() {
   TRACE("SPIN");
   Task stolen;
@@ -544,8 +667,9 @@
   }
 }
 
-_Requires_lock_held_(lock) void Scheduler::Worker::runUntilIdle(
-    std::unique_lock<std::mutex>& lock) {
+_Requires_lock_held_(work.mutex)
+void Scheduler::Worker::runUntilIdle() {
+  ASSERT_FIBER_STATE(currentFiber, Fiber::State::Running);
   MARL_ASSERT(work.num == work.fibers.size() + work.tasks.size(),
               "work.num out of sync");
   while (work.fibers.size() > 0 || work.tasks.size() > 0) {
@@ -556,20 +680,28 @@
     while (work.fibers.size() > 0) {
       work.num--;
       auto fiber = take(work.fibers);
-      lock.unlock();
+      // Sanity checks,
+      MARL_ASSERT(idleFibers.count(fiber) == 0, "dequeued fiber is idle");
+      MARL_ASSERT(fiber != currentFiber, "dequeued fiber is currently running");
+      ASSERT_FIBER_STATE(fiber, Fiber::State::Queued);
 
-      auto added = idleFibers.emplace(currentFiber).second;
-      (void)added;
-      MARL_ASSERT(added, "fiber already idle");
+      changeFiberState(currentFiber, Fiber::State::Running, Fiber::State::Idle);
+      work.mutex.unlock();
+      {  // unlocked
+        auto added = idleFibers.emplace(currentFiber).second;
+        (void)added;
+        MARL_ASSERT(added, "fiber already idle");
 
-      switchToFiber(fiber);
-      lock.lock();
+        switchToFiber(fiber);
+      }
+      work.mutex.lock();
+      changeFiberState(currentFiber, Fiber::State::Idle, Fiber::State::Running);
     }
 
     if (work.tasks.size() > 0) {
       work.num--;
       auto task = take(work.tasks);
-      lock.unlock();
+      work.mutex.unlock();
 
       // Run the task.
       task();
@@ -578,13 +710,14 @@
       // Ensure these are destructed outside of the lock.
       task = Task();
 
-      lock.lock();
+      work.mutex.lock();
     }
   }
 }
 
 Scheduler::Fiber* Scheduler::Worker::createWorkerFiber() {
   auto fiberId = static_cast<uint32_t>(workerFibers.size() + 1);
+  DBG_LOG("%d: CREATE(%d)", (int)id, (int)fiberId);
   auto fiber = Fiber::create(scheduler->allocator, fiberId, FiberStackSize,
                              [&] { run(); });
   auto ptr = fiber.get();
@@ -593,6 +726,7 @@
 }
 
 void Scheduler::Worker::switchToFiber(Fiber* to) {
+  DBG_LOG("%d: SWITCH(%d -> %d)", (int)id, (int)currentFiber->id, (int)to->id);
   MARL_ASSERT(to == mainFiber.get() || idleFibers.count(to) == 0,
               "switching to idle fiber");
   auto from = currentFiber;
diff --git a/src/scheduler_bench.cpp b/src/scheduler_bench.cpp
new file mode 100644
index 0000000..3548e54
--- /dev/null
+++ b/src/scheduler_bench.cpp
@@ -0,0 +1,48 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/waitgroup.h"
+
+#include "benchmark/benchmark.h"
+
+BENCHMARK_DEFINE_F(Schedule, Empty)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {  // Each iteration schedules numTasks tasks.
+      for (auto i = 0; i < numTasks; i++) {
+        marl::schedule([] {});  // No-op task; nothing here waits for it.
+      }
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, Empty)->Apply(Schedule::args);
+
+BENCHMARK_DEFINE_F(Schedule, SomeWork)
+(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::WaitGroup wg;
+      wg.add(numTasks);  // One count per task scheduled below.
+      for (auto i = 0; i < numTasks; i++) {
+        marl::schedule([=] {  // Captures wg and i by copy.
+          benchmark::DoNotOptimize(doSomeWork(i));
+          wg.done();
+        });
+      }
+      wg.wait();  // End the iteration once every task has called done().
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, SomeWork)->Apply(Schedule::args);
diff --git a/src/ticket_bench.cpp b/src/ticket_bench.cpp
new file mode 100644
index 0000000..ae1e22c
--- /dev/null
+++ b/src/ticket_bench.cpp
@@ -0,0 +1,39 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/defer.h"
+#include "marl/scheduler.h"
+#include "marl/thread.h"
+#include "marl/ticket.h"
+
+#include "benchmark/benchmark.h"
+
+BENCHMARK_DEFINE_F(Schedule, Ticket)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::Ticket::Queue queue;
+      for (int i = 0; i < numTasks; i++) {
+        auto ticket = queue.take();  // One ticket per scheduled task.
+        marl::schedule([ticket] {
+          ticket.wait();  // NOTE(review): presumably waits for earlier tickets.
+          ticket.done();
+        });
+      }
+      queue.take().wait();  // Waits on a ticket taken after all of the above.
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, Ticket)->Apply(Schedule::args<512>);
diff --git a/src/waitgroup_bench.cpp b/src/waitgroup_bench.cpp
new file mode 100644
index 0000000..2b29c27
--- /dev/null
+++ b/src/waitgroup_bench.cpp
@@ -0,0 +1,31 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/waitgroup.h"
+
+BENCHMARK_DEFINE_F(Schedule, WaitGroup)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::WaitGroup wg;
+      wg.add(numTasks);  // One count per task scheduled below.
+      for (auto i = 0; i < numTasks; i++) {
+        marl::schedule([=] { wg.done(); });  // The task only signals wg.
+      }
+      wg.wait();  // End the iteration once every task has called done().
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, WaitGroup)->Apply(Schedule::args);
diff --git a/third_party/benchmark b/third_party/benchmark
new file mode 160000
index 0000000..5ac80de
--- /dev/null
+++ b/third_party/benchmark
@@ -0,0 +1 @@
+Subproject commit 5ac80de0379ae1153ca6ef141df89ecf53bf1110
diff --git a/tools/bench/bench.go b/tools/bench/bench.go
new file mode 100644
index 0000000..81f2d72
--- /dev/null
+++ b/tools/bench/bench.go
@@ -0,0 +1,151 @@
+// Copyright 2020 The Marl Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package bench provides types and methods for parsing Google benchmark results.
+package bench
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// Test holds the results of a single benchmark test.
+type Test struct {
+	Name       string        // Full test name as it appears in the results.
+	NumTasks   uint          // Value of the "tasks:N" variable in Name; set by parseName.
+	NumThreads uint          // Value of the "threads:N" variable in Name; set by parseName.
+	Duration   time.Duration // Reported time in nanoseconds, truncated to a whole number.
+	Iterations uint          // Reported iteration count.
+}
+
+// testVarRE matches the "<name>:<number>" variables embedded in a test name.
+var testVarRE = regexp.MustCompile(`(\w+):([0-9]+)`)
+
+// parseName sets NumThreads and NumTasks from the "threads:N" and "tasks:N"
+// variables in t.Name. Variables with any other name are ignored.
+func (t *Test) parseName() {
+	for _, match := range testVarRE.FindAllStringSubmatch(t.Name, -1) {
+		n, err := strconv.Atoi(match[2])
+		if err != nil {
+			continue
+		}
+		switch match[1] {
+		case "threads":
+			t.NumThreads = uint(n)
+		case "tasks":
+			t.NumTasks = uint(n)
+		}
+	}
+}
+
+// Benchmark holds a set of benchmark test results.
+type Benchmark struct {
+	Tests []Test // One entry per parsed test result, in input order.
+}
+
+// Parse parses the benchmark results from the string s.
+// Parse will handle the json and 'console' formats.
+func Parse(s string) (Benchmark, error) {
+	type Parser = func(s string) (Benchmark, error)
+	for _, parser := range []Parser{parseConsole, parseJSON} {
+		b, err := parser(s)
+		switch err {
+		case nil:
+			return b, nil
+		case errWrongFormat:
+			break
+		default:
+			return Benchmark{}, err
+		}
+	}
+
+	return Benchmark{}, errors.New("Unrecognised file format")
+}
+
+// errWrongFormat tells Parse that the input is not in the parser's format.
+var errWrongFormat = errors.New("Wrong format")
+var consoleLineRE = regexp.MustCompile(`([\w/:]+)\s+([0-9]+(?:\.[0-9]+)?) ns\s+[0-9]+(?:\.[0-9]+)? ns\s+([0-9]+)`)
+
+// parseConsole parses the 'console' format: the test lines after the second dashed rule.
+func parseConsole(s string) (Benchmark, error) {
+	blocks := strings.Split(s, "------------------------------------------------------------------------------------------")
+	if len(blocks) != 3 {
+		return Benchmark{}, errWrongFormat
+	}
+
+	lines := strings.Split(blocks[2], "\n")
+	b := Benchmark{Tests: make([]Test, 0, len(lines))}
+	for _, line := range lines {
+		if len(strings.TrimSpace(line)) == 0 {
+			continue
+		}
+		matches := consoleLineRE.FindStringSubmatch(line)
+		if len(matches) != 4 {
+			return Benchmark{}, fmt.Errorf("Unable to parse the line:\n%s", line)
+		}
+		ns, err := strconv.ParseFloat(matches[2], 64)
+		if err != nil {
+			return Benchmark{}, fmt.Errorf("Unable to parse the duration: %s", matches[2])
+		}
+		iterations, err := strconv.Atoi(matches[3])
+		if err != nil {
+			return Benchmark{}, fmt.Errorf("Unable to parse the number of iterations: %s", matches[3])
+		}
+
+		t := Test{
+			Name:       matches[1],
+			Duration:   time.Nanosecond * time.Duration(ns),
+			Iterations: uint(iterations),
+		}
+		t.parseName()
+		b.Tests = append(b.Tests, t)
+	}
+	return b, nil
+}
+
+// parseJSON parses the json output format, reading each entry of the
+// top-level "benchmarks" array. real_time is interpreted as nanoseconds.
+func parseJSON(s string) (Benchmark, error) {
+	type entry struct {
+		Name       string  `json:"name"`
+		Iterations uint    `json:"iterations"`
+		Time       float64 `json:"real_time"`
+	}
+	var doc struct {
+		Tests []entry `json:"benchmarks"`
+	}
+	// Only the fields declared above are extracted; others are ignored.
+	if err := json.NewDecoder(strings.NewReader(s)).Decode(&doc); err != nil {
+		return Benchmark{}, err
+	}
+
+	out := Benchmark{Tests: make([]Test, 0, len(doc.Tests))}
+	for _, e := range doc.Tests {
+		t := Test{
+			Name: e.Name,
+			// int64() truncates any fractional nanoseconds.
+			Duration:   time.Nanosecond * time.Duration(int64(e.Time)),
+			Iterations: e.Iterations,
+		}
+		t.parseName()
+		out.Tests = append(out.Tests, t)
+	}
+
+	return out, nil
+}
diff --git a/tools/cmd/benchdiff/main.go b/tools/cmd/benchdiff/main.go
new file mode 100644
index 0000000..8cd93cf
--- /dev/null
+++ b/tools/cmd/benchdiff/main.go
@@ -0,0 +1,154 @@
+// Copyright 2020 The Marl Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// benchdiff is a tool that compares two Google benchmark results and displays
+// sorted performance differences.
+package main
+
+import (
+	"errors"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sort"
+	"text/tabwriter"
+	"time"
+
+	"../../bench"
+)
+
+// Command-line flags that control which differences compare reports.
+var (
+	minDiff    = flag.Duration("min-diff", time.Microsecond*10, "Filter away time diffs less than this duration")
+	minRelDiff = flag.Float64("min-rel-diff", 0.01, "Filter away absolute relative diffs between [1, 1+x]")
+)
+
+// main installs the usage text before parsing flags (so -h and flag errors print it), then diffs the two named files.
+func main() {
+	flag.ErrHelp = errors.New("benchdiff is a tool to compare two benchmark results")
+	flag.Usage = func() {
+		fmt.Fprintln(os.Stderr, "benchdiff <benchmark-a> <benchmark-b>")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	args := flag.Args()
+	if len(args) < 2 {
+		flag.Usage()
+		os.Exit(1)
+	}
+
+	if err := run(args[0], args[1]); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(-1)
+	}
+}
+
+// run loads and parses the benchmark results stored at pathA and pathB, then
+// prints their comparison. The first read or parse error is returned.
+func run(pathA, pathB string) error {
+	load := func(path string) (bench.Benchmark, error) {
+		data, err := ioutil.ReadFile(path)
+		if err != nil {
+			return bench.Benchmark{}, err
+		}
+		return bench.Parse(string(data))
+	}
+
+	benchA, err := load(pathA)
+	if err != nil {
+		return err
+	}
+	benchB, err := load(pathB)
+	if err != nil {
+		return err
+	}
+
+	compare(benchA, benchB, fileName(pathA), fileName(pathB))
+	return nil
+}
+
+func fileName(path string) string {
+	dir, _ := filepath.Split(path)
+	return path[len(dir):] // Split guarantees path == dir+file, so this is the file part.
+}
+
+// compare prints a table of the tests whose duration differs between benchA and
+// benchB by at least -min-diff and -min-rel-diff, relatively slowest-in-B first.
+func compare(benchA, benchB bench.Benchmark, nameA, nameB string) {
+	type times struct {
+		a time.Duration
+		b time.Duration
+	}
+	byName := map[string]times{}
+	for _, test := range benchA.Tests {
+		byName[test.Name] = times{a: test.Duration}
+	}
+	for _, test := range benchB.Tests {
+		t := byName[test.Name]
+		t.b = test.Duration
+		byName[test.Name] = t
+	}
+
+	type delta struct {
+		name       string
+		times      times
+		relDiff    float64 // b / a
+		absRelDiff float64 // relDiff, or its reciprocal when b is faster; always >= 1
+	}
+	deltas := []delta{}
+	for name, t := range byName {
+		if t.a == 0 || t.b == 0 {
+			continue // Assuming test was missing from a or b
+		}
+		absDiff := t.b - t.a
+		if absDiff < 0 {
+			absDiff = -absDiff
+		}
+		if absDiff < *minDiff {
+			continue
+		}
+		relDiff := float64(t.b) / float64(t.a)
+		absRelDiff := relDiff
+		if absRelDiff < 1 {
+			absRelDiff = 1.0 / absRelDiff
+		}
+		if absRelDiff < (1.0 + *minRelDiff) {
+			continue
+		}
+
+		deltas = append(deltas, delta{name: name, times: t, relDiff: relDiff, absRelDiff: absRelDiff})
+	}
+
+	// Map iteration order is random: break relDiff ties by name for stable output.
+	sort.Slice(deltas, func(i, j int) bool {
+		if deltas[i].relDiff != deltas[j].relDiff {
+			return deltas[i].relDiff > deltas[j].relDiff
+		}
+		return deltas[i].name < deltas[j].name
+	})
+
+	w := tabwriter.NewWriter(os.Stdout, 1, 1, 0, ' ', 0)
+	fmt.Fprintf(w, "Delta\t | Test name\t | (A) %v\t | (B) %v\n", nameA, nameB)
+	for _, d := range deltas {
+		sign, diff := "+", d.times.b-d.times.a
+		if diff < 0 {
+			sign, diff = "-", -diff
+		}
+		fmt.Fprintf(w, "%v%.2fx %v%+v\t | %v\t | %v\t | %v\n", sign, d.absRelDiff, sign, diff, d.name, d.times.a, d.times.b)
+	}
+	w.Flush()
+}