Subzero: add support for large stacks on Windows

If the stack frame size is 4K or larger, emit a call to chkstk, which
probes the stack one page at a time to commit the pages required to
support the large frame.
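
For reference, the probe is conceptually equivalent to the sketch
below (the names are illustrative, not part of Subzero or the Windows
CRT): walk the new frame one 4K page at a time so that every touch
lands in the current guard page and commits it, instead of jumping
straight past the guard page and crashing.

    #include <cstddef>
    #include <cstdint>

    // Touch one byte per 4K page of the frame about to be allocated
    // below `stackPointer`. Each touch hits the current guard page,
    // committing it and moving the guard down by one page. The real
    // __chkstk also rounds to page boundaries and preserves the
    // registers it uses; that is elided here.
    inline void probeStackPages(volatile uint8_t *stackPointer,
                                size_t frameSizeBytes) {
      constexpr size_t kPageSize = 4096;
      for (size_t offset = kPageSize; offset <= frameSizeBytes;
           offset += kPageSize) {
        stackPointer[-static_cast<ptrdiff_t>(offset)] = 0;
      }
    }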

Bug: swiftshader:25
Change-Id: I6b9f09218736ffb641cb1dbf95a1de7149633ef8
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/41608
Presubmit-Ready: Antonio Maiorano <amaiorano@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index f95ba71..5eabcb6 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -18,6 +18,7 @@
 
 #include "gtest/gtest.h"
 
+#include <array>
 #include <cmath>
 #include <thread>
 #include <tuple>
@@ -1501,6 +1502,53 @@
 	}
 }
 
+// This test was written because on Windows with Subzero, executing a function with a large number of
+// local variables would crash. On Windows, stack pages are committed on demand: an access that lands in
+// the 4K "guard page" at the edge of the committed region commits that page, and the next 4K page becomes
+// the new guard page. A stack access beyond the guard page is not handled this way; it raises an access
+// violation and crashes. To fix this, Subzero (like any Windows compiler) now emits a call to __chkstk
+// with the stack size in EAX, so that it can probe the stack in 4K increments up to that size, committing
+// the required pages. See https://docs.microsoft.com/en-us/windows/win32/devnotes/-win32-chkstk.
+TEST(ReactorUnitTests, LargeStack)
+{
+#if defined(_WIN32)
+	// An empirically large enough value to access outside the guard pages
+	constexpr int ArrayByteSize = 24 * 1024;
+	constexpr int ArraySize = ArrayByteSize / sizeof(int32_t);
+
+	FunctionT<void(int32_t * v)> function;
+	{
+		// Allocate a stack array large enough that writing to the first element will reach beyond
+		// the guard page.
+		Array<Int, ArraySize> largeStackArray;
+		for(int i = 0; i < ArraySize; ++i)
+		{
+			largeStackArray[i] = i;
+		}
+
+		Pointer<Int> in = function.Arg<0>();
+		for(int i = 0; i < ArraySize; ++i)
+		{
+			in[i] = largeStackArray[i];
+		}
+	}
+
+	auto routine = function("one");
+	std::array<int32_t, ArraySize> v;
+
+	// Run this in a new thread, so that we get the default committed stack size (8K on Win64).
+	std::thread t([&] {
+		routine(v.data());
+	});
+	t.join();
+
+	for(int i = 0; i < ArraySize; ++i)
+	{
+		EXPECT_EQ(v[i], i);
+	}
+#endif
+}
+
 TEST(ReactorUnitTests, Call)
 {
 	struct Class
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.cpp b/third_party/subzero/src/IceTargetLoweringX8632.cpp
index 0a7a56e..3262279 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8632.cpp
@@ -17,6 +17,10 @@
 
 #include "IceTargetLoweringX8632Traits.h"
 
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
+extern "C" void _chkstk();
+#endif
+
 namespace X8632 {
 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
   return ::Ice::X8632::TargetX8632::create(Func);
@@ -402,6 +406,32 @@
   lowerIndirectJump(T_ecx);
 }
 
+void TargetX8632::emitStackProbe(size_t StackSizeBytes) {
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
+  if (StackSizeBytes >= 4096) {
+    // _chkstk on Win32 is actually __alloca_probe, which adjusts ESP by the
+    // stack amount specified in EAX. We push ECX, save ESP in ECX, and
+    // restore both ESP and ECX after the call.
+
+    Variable *EAX = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+    Variable *ESP = makeReg(IceType_i32, Traits::RegisterSet::Reg_esp);
+    Variable *ECX = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
+
+    _push_reg(ECX->getRegNum());
+    _mov(ECX, ESP);
+
+    _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));
+
+    auto *CallTarget =
+        Ctx->getConstantInt32(reinterpret_cast<int32_t>(&_chkstk));
+    emitCallToTarget(CallTarget, nullptr);
+
+    _mov(ESP, ECX);
+    _pop_reg(ECX->getRegNum());
+  }
+#endif
+}
+
 // In some cases, there are x-macros tables for both high-level and low-level
 // instructions/operands that use the same enum key value. The tables are kept
 // separate to maintain a proper separation between abstraction layers. There
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.h b/third_party/subzero/src/IceTargetLoweringX8632.h
index 2715b0f..349fb92 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632.h
@@ -59,6 +59,7 @@
   void initSandbox();
   bool legalizeOptAddrForSandbox(OptAddr *Addr);
   void emitSandboxedReturn();
+  void emitStackProbe(size_t StackSizeBytes);
   void lowerIndirectJump(Variable *JumpTarget);
   void emitGetIP(CfgNode *Node);
   Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override;
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.cpp b/third_party/subzero/src/IceTargetLoweringX8664.cpp
index 9cfab50..5ec9e34 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8664.cpp
@@ -17,6 +17,10 @@
 #include "IceDefs.h"
 #include "IceTargetLoweringX8664Traits.h"
 
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
+extern "C" void __chkstk();
+#endif
+
 namespace X8664 {
 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
   return ::Ice::X8664::TargetX8664::create(Func);
@@ -758,6 +762,26 @@
   }
 }
 
+void TargetX8664::emitStackProbe(size_t StackSizeBytes) {
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
+  // Mirror the behavior of MSVC here, which emits a call to __chkstk when
+  // locals are >= 4KB, rather than the 8KB claimed by the docs.
+  if (StackSizeBytes >= 4096) {
+    // __chkstk on Win64 probes the stack up to RSP - EAX, but does not clobber
+    // RSP, so we don't need to save and restore it.
+
+    Variable *EAX = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
+    _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));
+
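+    // A 64-bit absolute address does not fit in a direct call's 32-bit
+    // displacement, so materialize the address of __chkstk in R11 (volatile
+    // and not used for argument passing in the Win64 ABI) and call through
+    // the register.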
+    auto *CallTarget =
+        Ctx->getConstantInt64(reinterpret_cast<int64_t>(&__chkstk));
+    Operand *CallTargetReg =
+        legalizeToReg(CallTarget, Traits::RegisterSet::Reg_r11);
+    emitCallToTarget(CallTargetReg, nullptr);
+  }
+#endif
+}
+
 // In some cases, there are x-macros tables for both high-level and low-level
 // instructions/operands that use the same enum key value. The tables are kept
 // separate to maintain a proper separation between abstraction layers. There
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.h b/third_party/subzero/src/IceTargetLoweringX8664.h
index ec24df6..3f33050 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664.h
@@ -62,6 +62,7 @@
   void initSandbox();
   bool legalizeOptAddrForSandbox(OptAddr *Addr);
   void emitSandboxedReturn();
+  void emitStackProbe(size_t StackSizeBytes);
   void lowerIndirectJump(Variable *JumpTarget);
   void emitGetIP(CfgNode *Node);
   Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) override;
diff --git a/third_party/subzero/src/IceTargetLoweringX86Base.h b/third_party/subzero/src/IceTargetLoweringX86Base.h
index 9d60609..46df7be 100644
--- a/third_party/subzero/src/IceTargetLoweringX86Base.h
+++ b/third_party/subzero/src/IceTargetLoweringX86Base.h
@@ -376,6 +376,12 @@
   void emitSandboxedReturn() {
     dispatchToConcrete(&Traits::ConcreteTarget::emitSandboxedReturn);
   }
+
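+  /// Emit a stack probe (a call to chkstk when targeting the Microsoft ABI)
+  /// if the frame is large enough to require committing new stack pages.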
+  void emitStackProbe(size_t StackSizeBytes) {
+    dispatchToConcrete(&Traits::ConcreteTarget::emitStackProbe,
+                       std::move(StackSizeBytes));
+  }
+
   /// Emit just the call instruction (without argument or return variable
   /// processing), sandboxing if needed.
   virtual Inst *emitCallToTarget(Operand *CallTarget, Variable *ReturnReg) = 0;
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
index becbbed..5b19e7c 100644
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -1199,6 +1199,8 @@
   SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
 
   if (SpillAreaSizeBytes) {
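+    // Probe the stack first so that, when targeting the Microsoft ABI, the
+    // pages backing the new frame are committed before the stack pointer
+    // moves past the guard page.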
+    emitStackProbe(SpillAreaSizeBytes);
+
     // Generate "sub stackptr, SpillAreaSizeBytes"
     _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
   }