diff --git a/runtime/codert_vm/arm64nathelp.m4 b/runtime/codert_vm/arm64nathelp.m4 index 7958aacdee1..f8693d01d0e 100644 --- a/runtime/codert_vm/arm64nathelp.m4 +++ b/runtime/codert_vm/arm64nathelp.m4 @@ -165,8 +165,7 @@ define({OLD_DUAL_MODE_HELPER_NO_RETURN_VALUE},{ define({NEW_DUAL_MODE_HELPER},{ DECLARE_EXTERN(fast_$1) - START_PROC($1) - SAVE_FPLR + BEGIN_HELPER($1) CALL_DIRECT(fast_$1) cbz x0,.L_done_$1 ldr x30,JIT_GPR_SAVE_SLOT(30) @@ -190,8 +189,7 @@ define({NEW_DUAL_MODE_HELPER},{ define({NEW_DUAL_MODE_HELPER_NO_RETURN_VALUE},{ DECLARE_EXTERN(fast_$1) - START_PROC($1) - SAVE_FPLR + BEGIN_HELPER($1) CALL_DIRECT(fast_$1) cbz x0,.L_done_$1 ldr x30,JIT_GPR_SAVE_SLOT(30) diff --git a/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp b/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp index 8ec958015d8..0d9487253bb 100644 --- a/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp +++ b/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp @@ -395,7 +395,6 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen { const TR::ARM64LinkageProperties &properties = _systemLinkage->getProperties(); TR::ARM64MemoryArgument *pushToMemory = NULL; - TR::Register *argMemReg; TR::Register *tempReg; int32_t argIndex = 0; int32_t numMemArgs = 0; @@ -503,12 +502,18 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen if (numMemArgs > 0) { pushToMemory = new (trStackMemory()) TR::ARM64MemoryArgument[numMemArgs]; - - argMemReg = cg()->allocateRegister(); } // align to 16-byte boundary totalSize = (totalSize + 15) & (~15); + /* + * We have J9_INLINE_JNI_MAX_ARG_COUNT (32) slots in interpreter frame. + * If the size of the native stack used for passing parameters exceeds this size, + * it will overwrite preserved registers. + * The number of parameters is guaranteed to be <= J9_INLINE_JNI_MAX_ARG_COUNT + * when HasFixedFrameC_CallingConvention codegen flag is set. + */ + TR_ASSERT_FATAL(totalSize <= sizeof(UDATA) * J9_INLINE_JNI_MAX_ARG_COUNT, "totalSize must not be larger than J9_INLINE_JNI_MAX_ARG_COUNT slots"); numIntegerArgs = 0; numFloatArgs = 0; @@ -522,6 +527,7 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen #if defined(OSX) sigCursor = (char *)sig; #endif + TR::RealRegister *sp = cg()->machine()->getRealRegister(properties.getStackPointerRegister()); for (i = firstArgumentChild; i < callNode->getNumChildren(); i++) { @@ -611,7 +617,7 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen #else #error Unsupported platform #endif - getOutgoingArgumentMemRef(argMemReg, argOffset, argRegister, op, pushToMemory[argIndex++]); + getOutgoingArgumentMemRef(sp, argOffset, argRegister, op, pushToMemory[argIndex++]); argOffset += offsetInc; } numIntegerArgs++; @@ -672,7 +678,7 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen #error Unsupported platform #endif } - getOutgoingArgumentMemRef(argMemReg, argOffset, argRegister, op, pushToMemory[argIndex++]); + getOutgoingArgumentMemRef(sp, argOffset, argRegister, op, pushToMemory[argIndex++]); argOffset += offsetInc; } numFloatArgs++; @@ -746,17 +752,12 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen if (numMemArgs > 0) { - TR::RealRegister *sp = cg()->machine()->getRealRegister(properties.getStackPointerRegister()); - generateTrg1Src1ImmInstruction(cg(), TR::InstOpCode::subimmx, callNode, argMemReg, sp, totalSize); - for (argIndex = 0; argIndex < numMemArgs; argIndex++) { TR::Register *aReg = pushToMemory[argIndex].argRegister; generateMemSrc1Instruction(cg(), pushToMemory[argIndex].opCode, callNode, pushToMemory[argIndex].argMemory, aReg); cg()->stopUsingRegister(aReg); } - - cg()->stopUsingRegister(argMemReg); } return totalSize; @@ -1058,20 +1059,7 @@ TR::Register *J9::ARM64::JNILinkage::buildDirectDispatch(TR::Node *callNode) #endif TR::RegisterDependencyConditions *deps = new (trHeapMemory()) TR::RegisterDependencyConditions(maxRegisters, maxPostRegisters, trMemory()); - size_t spSize = buildJNIArgs(callNode, deps, passThread, passReceiver, killNonVolatileGPRs); - TR::RealRegister *sp = machine()->getRealRegister(_systemLinkage->getProperties().getStackPointerRegister()); - - if (spSize > 0) - { - if (constantIsUnsignedImm12(spSize)) - { - generateTrg1Src1ImmInstruction(cg(), TR::InstOpCode::subimmx, callNode, sp, sp, spSize); - } - else - { - TR_ASSERT_FATAL(false, "Too many arguments."); - } - } + buildJNIArgs(callNode, deps, passThread, passReceiver, killNonVolatileGPRs); TR::Register *returnRegister = getReturnRegisterFromDeps(callNode, deps); @@ -1111,18 +1099,6 @@ TR::Register *J9::ARM64::JNILinkage::buildDirectDispatch(TR::Node *callNode) TR::Instruction *callInstr = generateMethodDispatch(callNode, isJNIGCPoint, deps, targetAddress, x9Reg); generateLabelInstruction(cg(), TR::InstOpCode::label, callNode, returnLabel, callInstr); - if (spSize > 0) - { - if (constantIsUnsignedImm12(spSize)) - { - generateTrg1Src1ImmInstruction(cg(), TR::InstOpCode::addimmx, callNode, sp, sp, spSize); - } - else - { - TR_ASSERT_FATAL(false, "Too many arguments."); - } - } - if (dropVMAccess) { #ifdef J9VM_INTERP_ATOMIC_FREE_JNI diff --git a/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp b/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp index 5840292427b..14a39b11fbe 100644 --- a/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp @@ -76,6 +76,9 @@ J9::ARM64::CodeGenerator::initialize() { comp->setOption(TR_EnableMonitorCacheLookup); } + + if (comp->fej9()->hasFixedFrameC_CallingConvention()) + cg->setHasFixedFrameC_CallingConvention(); } TR::Linkage * diff --git a/runtime/compiler/env/VMJ9.cpp b/runtime/compiler/env/VMJ9.cpp index af864ee872c..e2a6ef6432d 100644 --- a/runtime/compiler/env/VMJ9.cpp +++ b/runtime/compiler/env/VMJ9.cpp @@ -9654,7 +9654,7 @@ portLibCall_sysinfo_has_fixed_frame_C_calling_convention() /* AArch64 */ #if defined(TR_HOST_ARM64) && defined(TR_TARGET_ARM64) - return false; + return true; #endif /* X86 */ diff --git a/runtime/oti/arm64helpers.m4 b/runtime/oti/arm64helpers.m4 index 4ecf75cc3e4..1a12e932b98 100644 --- a/runtime/oti/arm64helpers.m4 +++ b/runtime/oti/arm64helpers.m4 @@ -75,10 +75,23 @@ define({JIT_GPR_SAVE_OFFSET},{eval(J9TR_cframe_jitGPRs+(($1)*ALen))}) define({JIT_GPR_SAVE_SLOT},{[sp,{#}JIT_GPR_SAVE_OFFSET($1)]}) define({JIT_FPR_SAVE_OFFSET},{eval(J9TR_cframe_jitFPRs+(($1)*8))}) define({JIT_FPR_SAVE_SLOT},{[sp,{#}JIT_FPR_SAVE_OFFSET($1)]}) +dnl The size of J9CInterpreterStackFrame is now larger than 504 bytes, so we cannot simply use sp as the base register for ldp/stp instructions +dnl because the offset would become too large and invalid for ldp/stp. +dnl Thus, we use sp + JIT_GPR_SAVE_OFFSET(0) as the base address for ldp/stp. We chose x16 as the base register. +dnl It is intra procedure call register and JIT does not use it, so it is safe to clobber x16. +define({JIT_GPR_SAVE_SLOT_X16},{[x16,{#}eval(($1)*ALen)]}) +define({JIT_FPR_SAVE_SLOT_X16},{[x16,{#}eval(JIT_FPR_SAVE_OFFSET($1)-JIT_GPR_SAVE_OFFSET(0))]}) -define({SAVE_FPLR},{stp x29,x30,JIT_GPR_SAVE_SLOT(29)}) +dnl JIT_GPR_SAVE_OFFSET is too large to fit in ldp/stp instructions. +define({SAVE_FPLR},{ + str x29,JIT_GPR_SAVE_SLOT(29) + str x30,JIT_GPR_SAVE_SLOT(30) +}) -define({RESTORE_FPLR},{ldp x29,x30,JIT_GPR_SAVE_SLOT(29)}) +define({RESTORE_FPLR},{ + ldr x29,JIT_GPR_SAVE_SLOT(29) + ldr x30,JIT_GPR_SAVE_SLOT(30) +}) define({BEGIN_HELPER},{ START_PROC($1) @@ -95,22 +108,23 @@ define({CALL_C_WITH_VMTHREAD},{ CALL_DIRECT($1) }) +dnl x16 is inter-procedure-call register define({SAVE_C_VOLATILE_REGS},{ - stp x0,x1,JIT_GPR_SAVE_SLOT(0) - stp x2,x3,JIT_GPR_SAVE_SLOT(2) - stp x4,x5,JIT_GPR_SAVE_SLOT(4) - stp x6,x7,JIT_GPR_SAVE_SLOT(6) - stp x8,x9,JIT_GPR_SAVE_SLOT(8) - stp x10,x11,JIT_GPR_SAVE_SLOT(10) - stp x12,x13,JIT_GPR_SAVE_SLOT(12) - stp x14,x15,JIT_GPR_SAVE_SLOT(14) - stp x16,x17,JIT_GPR_SAVE_SLOT(16) - str x18,JIT_GPR_SAVE_SLOT(18) + add x16, sp, JIT_GPR_SAVE_OFFSET(0) + stp x0,x1,JIT_GPR_SAVE_SLOT_X16(0) + stp x2,x3,JIT_GPR_SAVE_SLOT_X16(2) + stp x4,x5,JIT_GPR_SAVE_SLOT_X16(4) + stp x6,x7,JIT_GPR_SAVE_SLOT_X16(6) + stp x8,x9,JIT_GPR_SAVE_SLOT_X16(8) + stp x10,x11,JIT_GPR_SAVE_SLOT_X16(10) + stp x12,x13,JIT_GPR_SAVE_SLOT_X16(12) + stp x14,x15,JIT_GPR_SAVE_SLOT_X16(14) + stp x17,x18,JIT_GPR_SAVE_SLOT_X16(17) ifdef({METHOD_INVOCATION},{ - stp d0,d1,JIT_FPR_SAVE_SLOT(0) - stp d2,d3,JIT_FPR_SAVE_SLOT(2) - stp d4,d5,JIT_FPR_SAVE_SLOT(4) - stp d6,d7,JIT_FPR_SAVE_SLOT(6) + stp d0,d1,JIT_FPR_SAVE_SLOT_X16(0) + stp d2,d3,JIT_FPR_SAVE_SLOT_X16(2) + stp d4,d5,JIT_FPR_SAVE_SLOT_X16(4) + stp d6,d7,JIT_FPR_SAVE_SLOT_X16(6) },{ dnl METHOD_INVOCATION add x15, sp, JIT_FPR_SAVE_OFFSET(0) st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [x15], #64 @@ -125,11 +139,12 @@ ifdef({METHOD_INVOCATION},{ }) define({RESTORE_C_VOLATILE_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) ifdef({METHOD_INVOCATION},{ - ldp d0,d1,JIT_FPR_SAVE_SLOT(0) - ldp d2,d3,JIT_FPR_SAVE_SLOT(2) - ldp d4,d5,JIT_FPR_SAVE_SLOT(4) - ldp d6,d7,JIT_FPR_SAVE_SLOT(6) + ldp d0,d1,JIT_FPR_SAVE_SLOT_X16(0) + ldp d2,d3,JIT_FPR_SAVE_SLOT_X16(2) + ldp d4,d5,JIT_FPR_SAVE_SLOT_X16(4) + ldp d6,d7,JIT_FPR_SAVE_SLOT_X16(6) },{ dnl METHOD_INVOCATION add x15, sp, JIT_FPR_SAVE_OFFSET(0) ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [x15], #64 @@ -141,37 +156,38 @@ ifdef({METHOD_INVOCATION},{ ld1 {{v24.4s, v25.4s, v26.4s, v27.4s}}, [x15], #64 ld1 {{v28.4s, v29.4s, v30.4s, v31.4s}}, [x15] }) dnl METHOD_INVOCATION - ldp x0,x1,JIT_GPR_SAVE_SLOT(0) - ldp x2,x3,JIT_GPR_SAVE_SLOT(2) - ldp x4,x5,JIT_GPR_SAVE_SLOT(4) - ldp x6,x7,JIT_GPR_SAVE_SLOT(6) - ldp x8,x9,JIT_GPR_SAVE_SLOT(8) - ldp x10,x11,JIT_GPR_SAVE_SLOT(10) - ldp x12,x13,JIT_GPR_SAVE_SLOT(12) - ldp x14,x15,JIT_GPR_SAVE_SLOT(14) - ldp x16,x17,JIT_GPR_SAVE_SLOT(16) - ldr x18,JIT_GPR_SAVE_SLOT(18) + ldp x0,x1,JIT_GPR_SAVE_SLOT_X16(0) + ldp x2,x3,JIT_GPR_SAVE_SLOT_X16(2) + ldp x4,x5,JIT_GPR_SAVE_SLOT_X16(4) + ldp x6,x7,JIT_GPR_SAVE_SLOT_X16(6) + ldp x8,x9,JIT_GPR_SAVE_SLOT_X16(8) + ldp x10,x11,JIT_GPR_SAVE_SLOT_X16(10) + ldp x12,x13,JIT_GPR_SAVE_SLOT_X16(12) + ldp x14,x15,JIT_GPR_SAVE_SLOT_X16(14) + ldp x17,x18,JIT_GPR_SAVE_SLOT_X16(17) }) dnl No need to save/restore d8-15 - the stack walker will never need to read dnl or modify them (no preserved FPRs in the JIT private linkage). define({SAVE_C_NONVOLATILE_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) str x19,JIT_GPR_SAVE_SLOT(19) - stp x20,x21,JIT_GPR_SAVE_SLOT(20) - stp x22,x23,JIT_GPR_SAVE_SLOT(22) - stp x24,x25,JIT_GPR_SAVE_SLOT(24) - stp x26,x27,JIT_GPR_SAVE_SLOT(26) - stp x28,x29,JIT_GPR_SAVE_SLOT(28) + stp x20,x21,JIT_GPR_SAVE_SLOT_X16(20) + stp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + stp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + stp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + stp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({RESTORE_C_NONVOLATILE_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) ldr x19,JIT_GPR_SAVE_SLOT(19) - ldp x20,x21,JIT_GPR_SAVE_SLOT(20) - ldp x22,x23,JIT_GPR_SAVE_SLOT(22) - ldp x24,x25,JIT_GPR_SAVE_SLOT(24) - ldp x26,x27,JIT_GPR_SAVE_SLOT(26) - ldp x28,x29,JIT_GPR_SAVE_SLOT(28) + ldp x20,x21,JIT_GPR_SAVE_SLOT_X16(20) + ldp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + ldp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + ldp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + ldp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({SAVE_ALL_REGS},{ @@ -185,21 +201,23 @@ define({RESTORE_ALL_REGS},{ }) define({SAVE_PRESERVED_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) str x18,JIT_GPR_SAVE_SLOT(18) str x21,JIT_GPR_SAVE_SLOT(21) - stp x22,x23,JIT_GPR_SAVE_SLOT(22) - stp x24,x25,JIT_GPR_SAVE_SLOT(24) - stp x26,x27,JIT_GPR_SAVE_SLOT(26) - stp x28,x29,JIT_GPR_SAVE_SLOT(28) + stp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + stp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + stp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + stp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({RESTORE_PRESERVED_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) ldr x18,JIT_GPR_SAVE_SLOT(18) ldr x21,JIT_GPR_SAVE_SLOT(21) - ldp x22,x23,JIT_GPR_SAVE_SLOT(22) - ldp x24,x25,JIT_GPR_SAVE_SLOT(24) - ldp x26,x27,JIT_GPR_SAVE_SLOT(26) - ldp x28,x29,JIT_GPR_SAVE_SLOT(28) + ldp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + ldp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + ldp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + ldp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({BRANCH_VIA_VMTHREAD},{ diff --git a/runtime/oti/j9nonbuilder.h b/runtime/oti/j9nonbuilder.h index eae95d24728..8edb8040827 100644 --- a/runtime/oti/j9nonbuilder.h +++ b/runtime/oti/j9nonbuilder.h @@ -6291,6 +6291,7 @@ typedef struct J9CInterpreterStackFrame { U_8 jitFPRs[16 * 8]; /* fpr0-15 */ #endif /* J9VM_ENV_DATA64 */ #elif defined(J9VM_ARCH_AARCH64) /* J9VM_ARCH_ARM */ + UDATA outgoingArguments[J9_INLINE_JNI_MAX_ARG_COUNT]; UDATA preservedGPRs[12]; /* x19-x30 */ U_8 preservedFPRs[8 * 8]; /* v8-15 */ J9JITGPRSpillArea jitGPRs;