Subzero: fix calling C functions on Windows x64
This change addresses the following issues:
* Microsoft x64 ABI assigns registers to the first four arguments by
argument position, not by type count.
* Microsoft x64 ABI expects the caller to allocate stack space into which
the callee may copy the 4 register arguments, called the Shadow Store or
Home Space.
* Fix bug where preserved register area size was not computed correctly
when Xmm registers were being preserved, as it was assuming all
preserved registers were 8 bytes large.
Bug: b/142132927
Change-Id: Ibc2d82ab117c062eed2e7f66109c9d6bbdc09a8b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/37272
Reviewed-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 91feb65..29bc817 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -160,7 +160,7 @@
{
const Capabilities Caps =
{
- false, // CallSupported
+ true, // CallSupported
false, // CoroutinesSupported
};
diff --git a/third_party/subzero/src/IceTargetLoweringX8632Traits.h b/third_party/subzero/src/IceTargetLoweringX8632Traits.h
index 380ba00..092457d 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632Traits.h
@@ -694,6 +694,12 @@
(void)ArgNum;
return RegNumT();
}
+ // Given the absolute argument position and argument position by type, return
+ // the register index to assign it to.
+ static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
+ (void)argPos;
+ return argPosByType;
+ };
/// The number of bits in a byte
static constexpr uint32_t X86_CHAR_BIT = 8;
diff --git a/third_party/subzero/src/IceTargetLoweringX8664Traits.h b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
index ba67c69..e82b03a 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
@@ -742,6 +742,15 @@
assert(Ty == IceType_i64 || Ty == IceType_i32);
return getGprForType(Ty, GprForArgNum[ArgNum]);
}
+ // Given the absolute argument position and argument position by type, return
+ // the register index to assign it to.
+ static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
+ // Microsoft x64 ABI: register is selected by arg position (e.g. 1st int as
+ // 2nd param goes into 2nd int reg)
+ (void)argPosByType;
+ return argPos;
+ };
+
#else
// System V x86-64 calling convention:
//
@@ -774,6 +783,12 @@
assert(Ty == IceType_i64 || Ty == IceType_i32);
return getGprForType(Ty, GprForArgNum[ArgNum]);
}
+ // Given the absolute argument position and argument position by type, return
+ // the register index to assign it to.
+ static SizeT getArgIndex(SizeT argPos, SizeT argPosByType) {
+ (void)argPos;
+ return argPosByType;
+ }
#endif
/// Whether scalar floating point arguments are passed in XMM registers
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
index 00734fd..523f80c 100644
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -996,9 +996,9 @@
void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
// Stack frame layout:
//
- // +------------------------+
- // | 1. return address |
- // +------------------------+
+ // +------------------------+ ^ +
+ // | 1. return address | |
+ // +------------------------+ v -
// | 2. preserved registers |
// +------------------------+ <--- BasePointer (if used)
// | 3. padding |
@@ -1011,6 +1011,8 @@
// +------------------------+
// | 7. padding |
// +------------------------+
+ // | 7.5 shadow (WinX64) |
+ // +------------------------+
// | 8. allocas |
// +------------------------+
// | 9. padding |
@@ -1040,6 +1042,17 @@
// space on the frame for globals (variables with multi-block lifetime), and
// one block to share for locals (single-block lifetime).
+ // The Microsoft x64 ABI requires the caller to allocate a minimum 32 byte
+ // "shadow store" (aka "home space") so that the callee may copy the 4
+ // register args to it.
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
+ const SizeT ShadowStoreSize = Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
+#else
+ const SizeT ShadowStoreSize = 0;
+#endif
+
+ // StackPointer: points just past return address of calling function
+
Context.init(Node);
Context.setInsertPoint(Context.getCur());
@@ -1092,11 +1105,17 @@
for (RegNumT RegNum : RegNumBVIter(Pushed)) {
assert(RegNum == Traits::getBaseReg(RegNum));
++NumCallee;
- PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+ if (Traits::isXmm(RegNum)) {
+ PreservedRegsSizeBytes += 16;
+ } else {
+ PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
+ }
_push_reg(RegNum);
}
Ctx->statsUpdateRegistersSaved(NumCallee);
+ // StackPointer: points past preserved registers at start of spill area
+
// Generate "push frameptr; mov frameptr, stackptr"
if (IsEbpBasedFrame) {
assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
@@ -1148,20 +1167,29 @@
if (PrologEmitsFixedAllocas)
SpillAreaSizeBytes += FixedAllocaSizeBytes;
+ // Win64 ABI: add space for shadow store (aka home space)
+ SpillAreaSizeBytes += ShadowStoreSize;
+
// Entering the function has made the stack pointer unaligned. Re-align it by
// adjusting the stack size.
- uint32_t StackOffset = Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
+ // Note that StackOffset does not include spill area. It's the offset from the
// base stack pointer (ebp), whether we set it or not, to the first stack
+ // arg (if any). StackSize, on the other hand, does include the spill area.
+ const uint32_t StackOffset =
+ ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
RequiredStackAlignment);
StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
RequiredStackAlignment);
- SpillAreaSizeBytes = StackSize - StackOffset;
+ SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
if (SpillAreaSizeBytes) {
// Generate "sub stackptr, SpillAreaSizeBytes"
_sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
}
+ // StackPointer: points just past the spill area (end of stack frame)
+
// If the required alignment is greater than the stack pointer's guaranteed
// alignment, align the stack pointer accordingly.
if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
@@ -1170,6 +1198,8 @@
Ctx->getConstantInt32(-RequiredStackAlignment));
}
+ // StackPointer: may have just been offset for alignment
+
// Account for known-frame-offset alloca instructions that were not already
// combined into the prolog.
if (!PrologEmitsFixedAllocas)
@@ -1182,8 +1212,7 @@
// Arg[0] is closest to the stack/frame pointer.
RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
- size_t BasicFrameOffset =
- PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
+ size_t BasicFrameOffset = StackOffset;
if (!IsEbpBasedFrame)
BasicFrameOffset += SpillAreaSizeBytes;
@@ -1193,22 +1222,26 @@
size_t InArgsSizeBytes = 0;
unsigned NumXmmArgs = 0;
unsigned NumGPRArgs = 0;
- for (Variable *Arg : Args) {
+ for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
+ Variable *Arg = Args[i];
// Skip arguments passed in registers.
if (isVectorType(Arg->getType())) {
- if (Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
+ if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+ .hasValue()) {
++NumXmmArgs;
continue;
}
} else if (isScalarFloatingType(Arg->getType())) {
if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
- Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
+ Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
+ .hasValue()) {
++NumXmmArgs;
continue;
}
} else {
assert(isScalarIntegerType(Arg->getType()));
- if (Traits::getRegisterForGprArgNum(Traits::WordType, NumGPRArgs)
+ if (Traits::getRegisterForGprArgNum(Traits::WordType,
+ Traits::getArgIndex(i, NumGPRArgs))
.hasValue()) {
++NumGPRArgs;
continue;
@@ -1551,7 +1584,8 @@
Variable *RegisterArg = nullptr;
RegNumT RegNum;
if (isVectorType(Ty)) {
- RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
+ RegNum =
+ Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
if (RegNum.hasNoValue()) {
XmmSlotsRemain = false;
continue;
@@ -1562,7 +1596,8 @@
if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
continue;
}
- RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
+ RegNum =
+ Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
if (RegNum.hasNoValue()) {
XmmSlotsRemain = false;
continue;
@@ -1570,7 +1605,8 @@
++NumXmmArgs;
RegisterArg = Func->makeVariable(Ty);
} else if (isScalarIntegerType(Ty)) {
- RegNum = Traits::getRegisterForGprArgNum(Ty, NumGprArgs);
+ RegNum = Traits::getRegisterForGprArgNum(
+ Ty, Traits::getArgIndex(i, NumGprArgs));
if (RegNum.hasNoValue()) {
GprSlotsRemain = false;
continue;
@@ -2617,11 +2653,14 @@
RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
Traits::X86_STACK_ALIGNMENT_BYTES);
- using OperandList =
- llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
- Traits::X86_MAX_GPR_ARGS)>;
+ constexpr SizeT MaxOperands =
+ constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
+ using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
+
OperandList XmmArgs;
+ llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
CfgVector<std::pair<const Type, Operand *>> GprArgs;
+ CfgVector<SizeT> GprArgIndices;
OperandList StackArgs, StackArgLocations;
uint32_t ParameterAreaSizeBytes = 0;
@@ -2633,14 +2672,22 @@
// The PNaCl ABI requires the width of arguments to be at least 32 bits.
assert(typeWidthInBytes(Ty) >= 4);
if (isVectorType(Ty) &&
- Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
+ Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
+ .hasValue()) {
XmmArgs.push_back(Arg);
+ XmmArgIndices.push_back(i);
} else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
- Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
+ Traits::getRegisterForXmmArgNum(
+ Traits::getArgIndex(i, XmmArgs.size()))
+ .hasValue()) {
XmmArgs.push_back(Arg);
+ XmmArgIndices.push_back(i);
} else if (isScalarIntegerType(Ty) &&
- Traits::getRegisterForGprArgNum(Ty, GprArgs.size()).hasValue()) {
+ Traits::getRegisterForGprArgNum(
+ Ty, Traits::getArgIndex(i, GprArgs.size()))
+ .hasValue()) {
GprArgs.emplace_back(Ty, Arg);
+ GprArgIndices.push_back(i);
} else {
// Place on stack.
StackArgs.push_back(Arg);
@@ -2678,16 +2725,18 @@
}
// Copy arguments to be passed in registers to the appropriate registers.
for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
- XmmArgs[i] =
- legalizeToReg(legalize(XmmArgs[i]), Traits::getRegisterForXmmArgNum(i));
+ XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
+ Traits::getRegisterForXmmArgNum(
+ Traits::getArgIndex(XmmArgIndices[i], i)));
}
// Materialize moves for arguments passed in GPRs.
for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
const Type SignatureTy = GprArgs[i].first;
Operand *Arg =
legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
- GprArgs[i].second =
- legalizeToReg(Arg, Traits::getRegisterForGprArgNum(Arg->getType(), i));
+ GprArgs[i].second = legalizeToReg(
+ Arg, Traits::getRegisterForGprArgNum(
+ Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
assert(SignatureTy == Arg->getType());
(void)SignatureTy;