Subzero: fix calling C functions on Windows x64

This change addresses the following issues:
* The Microsoft x64 ABI assigns registers to the first four arguments by
absolute argument position, not by per-type count (see the first sketch
below).
* The Microsoft x64 ABI expects the caller to allocate stack space into
which the callee may copy its four register arguments, known as the
Shadow Store or Home Space (see the second sketch below).
* Fix a bug where the preserved register area size was computed
incorrectly when XMM registers were preserved: the code assumed all
preserved registers were 8 bytes wide, but XMM registers take 16 bytes.
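
For illustration, here is a minimal C++ sketch (not part of this change;
the function names and the example signature are invented) of the slot
selection rule captured by the new Traits::getArgIndex hook:

    // Sketch only: which argument-register slot an argument gets.
    #include <cstddef>
    #include <cstdio>

    // Microsoft x64: the slot is the absolute argument position.
    std::size_t winX64Slot(std::size_t argPos, std::size_t /*argPosByType*/) {
      return argPos;
    }

    // System V x86-64: the slot is the per-type argument count.
    std::size_t sysVSlot(std::size_t /*argPos*/, std::size_t argPosByType) {
      return argPosByType;
    }

    int main() {
      // For void f(int a, float b, int c, float d), 'c' is argument 2
      // overall and the 2nd integer argument (per-type index 1).
      std::printf("Win x64 GPR slot for 'c': %zu (R8)\n", winX64Slot(2, 1));
      std::printf("System V GPR slot for 'c': %zu (RSI)\n", sysVSlot(2, 1));
      return 0;
    }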
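
And a minimal sketch, assuming an 8-byte word size (the constant and
function names are invented), of the offset arithmetic that the shadow
store adds to the prolog in addProlog below:

    // Sketch only: bytes between the callee's frame base and its first
    // stack argument once the caller-allocated shadow store is counted.
    #include <cstdint>

    constexpr uint32_t RetIpSizeBytes = 8;      // return address on x86-64
    constexpr uint32_t ShadowStoreSize = 4 * 8; // four 8-byte home slots

    uint32_t firstStackArgOffset(uint32_t preservedRegsSizeBytes) {
      return ShadowStoreSize + RetIpSizeBytes + preservedRegsSizeBytes;
    }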

Bug: b/142132927
Change-Id: Ibc2d82ab117c062eed2e7f66109c9d6bbdc09a8b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/37272
Reviewed-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 91feb65..29bc817 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -160,7 +160,7 @@
 {
 	const Capabilities Caps =
 	{
-		false, // CallSupported
+		true, // CallSupported
 		false, // CoroutinesSupported
 	};
 
diff --git a/third_party/subzero/src/IceTargetLoweringX8632Traits.h b/third_party/subzero/src/IceTargetLoweringX8632Traits.h
index 380ba00..092457d 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632Traits.h
@@ -694,6 +694,12 @@
     (void)ArgNum;
     return RegNumT();
   }
+  // Given the absolute argument position and argument position by type, return
+  // the register index to assign it to.
+  static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
+    (void)argPos;
+    return argPosByType;
+  }
 
   /// The number of bits in a byte
   static constexpr uint32_t X86_CHAR_BIT = 8;
diff --git a/third_party/subzero/src/IceTargetLoweringX8664Traits.h b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
index ba67c69..e82b03a 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
@@ -742,6 +742,15 @@
     assert(Ty == IceType_i64 || Ty == IceType_i32);
     return getGprForType(Ty, GprForArgNum[ArgNum]);
   }
+  // Given the absolute argument position and argument position by type, return
+  // the register index to assign it to.
+  static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
+    // Microsoft x64 ABI: the register is selected by absolute arg position
+    // (e.g. an int passed as the 2nd parameter goes into the 2nd int reg).
+    (void)argPosByType;
+    return argPos;
+  }
+
 #else
   // System V x86-64 calling convention:
   //
@@ -774,6 +783,12 @@
     assert(Ty == IceType_i64 || Ty == IceType_i32);
     return getGprForType(Ty, GprForArgNum[ArgNum]);
   }
+  // Given the absolute argument position and argument position by type, return
+  // the register index to assign it to.
+  static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
+    (void)argPos;
+    return argPosByType;
+  }
 #endif
 
   /// Whether scalar floating point arguments are passed in XMM registers
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
index 00734fd..523f80c 100644
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -996,9 +996,9 @@
 void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
   // Stack frame layout:
   //
-  // +------------------------+
-  // | 1. return address      |
-  // +------------------------+
+  // +------------------------+  ^ +
+  // | 1. return address      |  |
+  // +------------------------+  v -
   // | 2. preserved registers |
   // +------------------------+ <--- BasePointer (if used)
   // | 3. padding             |
@@ -1011,6 +1011,8 @@
   // +------------------------+
   // | 7. padding             |
   // +------------------------+
+  // | 7.5 shadow (WinX64)    |
+  // +------------------------+
   // | 8. allocas             |
   // +------------------------+
   // | 9. padding             |
@@ -1040,6 +1042,17 @@
   // space on the frame for globals (variables with multi-block lifetime), and
   // one block to share for locals (single-block lifetime).
 
+  // The Microsoft x64 ABI requires the caller to allocate a minimum 32 byte
+  // "shadow store" (aka "home space") so that the callee may copy the 4
+  // register args to it.
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
+  const SizeT ShadowStoreSize = Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
+#else
+  const SizeT ShadowStoreSize = 0;
+#endif
+
+  // StackPointer: points just past return address of calling function
+
   Context.init(Node);
   Context.setInsertPoint(Context.getCur());
 
@@ -1092,11 +1105,17 @@
   for (RegNumT RegNum : RegNumBVIter(Pushed)) {
     assert(RegNum == Traits::getBaseReg(RegNum));
     ++NumCallee;
-    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+    if (Traits::isXmm(RegNum)) {
+      PreservedRegsSizeBytes += 16;
+    } else {
+      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+    }
     _push_reg(RegNum);
   }
   Ctx->statsUpdateRegistersSaved(NumCallee);
 
+  // StackPointer: points past preserved registers at start of spill area
+
   // Generate "push frameptr; mov frameptr, stackptr"
   if (IsEbpBasedFrame) {
     assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
@@ -1148,20 +1167,29 @@
   if (PrologEmitsFixedAllocas)
     SpillAreaSizeBytes += FixedAllocaSizeBytes;
 
+  // Win64 ABI: add space for shadow store (aka home space)
+  SpillAreaSizeBytes += ShadowStoreSize;
+
   // Entering the function has made the stack pointer unaligned. Re-align it by
   // adjusting the stack size.
-  uint32_t StackOffset = Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
+  // Note that StackOffset does not include the spill area. It is the offset
+  // from the base stack pointer (ebp), whether set or not, to the first stack
+  // arg (if any). StackSize, on the other hand, does include the spill area.
+  const uint32_t StackOffset =
+      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
   uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
                                              RequiredStackAlignment);
   StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
                                     RequiredStackAlignment);
-  SpillAreaSizeBytes = StackSize - StackOffset;
+  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
 
   if (SpillAreaSizeBytes) {
     // Generate "sub stackptr, SpillAreaSizeBytes"
     _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
   }
 
+  // StackPointer: points just past the spill area (end of stack frame)
+
   // If the required alignment is greater than the stack pointer's guaranteed
   // alignment, align the stack pointer accordingly.
   if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
@@ -1170,6 +1198,8 @@
          Ctx->getConstantInt32(-RequiredStackAlignment));
   }
 
+  // StackPointer: may have just been offset for alignment
+
   // Account for known-frame-offset alloca instructions that were not already
   // combined into the prolog.
   if (!PrologEmitsFixedAllocas)
@@ -1182,8 +1212,7 @@
   // Arg[0] is closest to the stack/frame pointer.
   RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
   Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
-  size_t BasicFrameOffset =
-      PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
+  size_t BasicFrameOffset = StackOffset;
   if (!IsEbpBasedFrame)
     BasicFrameOffset += SpillAreaSizeBytes;
 
@@ -1193,22 +1222,26 @@
   size_t InArgsSizeBytes = 0;
   unsigned NumXmmArgs = 0;
   unsigned NumGPRArgs = 0;
-  for (Variable *Arg : Args) {
+  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
+    Variable *Arg = Args[i];
     // Skip arguments passed in registers.
     if (isVectorType(Arg->getType())) {
-      if (Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
+      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+              .hasValue()) {
         ++NumXmmArgs;
         continue;
       }
     } else if (isScalarFloatingType(Arg->getType())) {
       if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
-          Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+              .hasValue()) {
         ++NumXmmArgs;
         continue;
       }
     } else {
       assert(isScalarIntegerType(Arg->getType()));
-      if (Traits::getRegisterForGprArgNum(Traits::WordType, NumGPRArgs)
+      if (Traits::getRegisterForGprArgNum(Traits::WordType,
+                                          Traits::getArgIndex(i, NumGPRArgs))
               .hasValue()) {
         ++NumGPRArgs;
         continue;
@@ -1551,7 +1584,8 @@
     Variable *RegisterArg = nullptr;
     RegNumT RegNum;
     if (isVectorType(Ty)) {
-      RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
+      RegNum =
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
       if (RegNum.hasNoValue()) {
         XmmSlotsRemain = false;
         continue;
@@ -1562,7 +1596,8 @@
       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
         continue;
       }
-      RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
+      RegNum =
+          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
       if (RegNum.hasNoValue()) {
         XmmSlotsRemain = false;
         continue;
@@ -1570,7 +1605,8 @@
       ++NumXmmArgs;
       RegisterArg = Func->makeVariable(Ty);
     } else if (isScalarIntegerType(Ty)) {
-      RegNum = Traits::getRegisterForGprArgNum(Ty, NumGprArgs);
+      RegNum = Traits::getRegisterForGprArgNum(
+          Ty, Traits::getArgIndex(i, NumGprArgs));
       if (RegNum.hasNoValue()) {
         GprSlotsRemain = false;
         continue;
@@ -2617,11 +2653,14 @@
   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
                                             Traits::X86_STACK_ALIGNMENT_BYTES);
 
-  using OperandList =
-      llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
-                                                Traits::X86_MAX_GPR_ARGS)>;
+  constexpr SizeT MaxOperands =
+      constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
+  using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
+
   OperandList XmmArgs;
+  llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
   CfgVector<std::pair<const Type, Operand *>> GprArgs;
+  CfgVector<SizeT> GprArgIndices;
   OperandList StackArgs, StackArgLocations;
   uint32_t ParameterAreaSizeBytes = 0;
 
@@ -2633,14 +2672,22 @@
     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
     assert(typeWidthInBytes(Ty) >= 4);
     if (isVectorType(Ty) &&
-        Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
+        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
+            .hasValue()) {
       XmmArgs.push_back(Arg);
+      XmmArgIndices.push_back(i);
     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
-               Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
+               Traits::getRegisterForXmmArgNum(
+                   Traits::getArgIndex(i, XmmArgs.size()))
+                   .hasValue()) {
       XmmArgs.push_back(Arg);
+      XmmArgIndices.push_back(i);
     } else if (isScalarIntegerType(Ty) &&
-               Traits::getRegisterForGprArgNum(Ty, GprArgs.size()).hasValue()) {
+               Traits::getRegisterForGprArgNum(
+                   Ty, Traits::getArgIndex(i, GprArgs.size()))
+                   .hasValue()) {
       GprArgs.emplace_back(Ty, Arg);
+      GprArgIndices.push_back(i);
     } else {
       // Place on stack.
       StackArgs.push_back(Arg);
@@ -2678,16 +2725,18 @@
   }
   // Copy arguments to be passed in registers to the appropriate registers.
   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
-    XmmArgs[i] =
-        legalizeToReg(legalize(XmmArgs[i]), Traits::getRegisterForXmmArgNum(i));
+    XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
+                               Traits::getRegisterForXmmArgNum(
+                                   Traits::getArgIndex(XmmArgIndices[i], i)));
   }
   // Materialize moves for arguments passed in GPRs.
   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
     const Type SignatureTy = GprArgs[i].first;
     Operand *Arg =
         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
-    GprArgs[i].second =
-        legalizeToReg(Arg, Traits::getRegisterForGprArgNum(Arg->getType(), i));
+    GprArgs[i].second = legalizeToReg(
+        Arg, Traits::getRegisterForGprArgNum(
+                 Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
     assert(SignatureTy == Arg->getType());
     (void)SignatureTy;