diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 0b976eb1c8..1248b1ab01 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -351,8 +351,10 @@ void Jit64::Shutdown() void Jit64::FallBackToInterpreter(UGeckoInstruction inst) { FlushCarry(); - gpr.Flush(BitSet32(0xFFFFFFFF), RegCache::IgnoreDiscardedRegisters::Yes); - fpr.Flush(BitSet32(0xFFFFFFFF), RegCache::IgnoreDiscardedRegisters::Yes); + gpr.Flush(BitSet32(0xFFFFFFFF), RegCache::FlushMode::Full, + RegCache::IgnoreDiscardedRegisters::Yes); + fpr.Flush(BitSet32(0xFFFFFFFF), RegCache::FlushMode::Full, + RegCache::IgnoreDiscardedRegisters::Yes); if (js.op->canEndBlock) { @@ -1151,7 +1153,8 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) // output, which needs to be bound in the actual instruction compilation. // TODO: make this smarter in the case that we're actually register-starved, i.e. // prioritize the more important registers. - gpr.PreloadRegisters(op.regsIn & op.gprInUse & ~op.gprDiscardable); + gpr.PreloadRegisters(op.regsIn & (op.gprWillBeRead | op.gprWillBeWritten) & + ~op.gprDiscardable); fpr.PreloadRegisters(op.fregsIn & op.fprInXmm & ~op.fprDiscardable); } @@ -1230,8 +1233,12 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) gpr.Discard(op.gprDiscardable); fpr.Discard(op.fprDiscardable); } - gpr.Flush(~op.gprInUse & (op.regsIn | op.regsOut)); - fpr.Flush(~op.fprInUse & (op.fregsIn | op.GetFregsOut())); + gpr.Flush(~(op.gprWillBeRead | op.gprWillBeWritten) & (op.regsIn | op.regsOut), + RegCache::FlushMode::Full); + fpr.Flush(~(op.fprWillBeRead | op.fprWillBeWritten) & (op.fregsIn | op.GetFregsOut()), + RegCache::FlushMode::Full); + gpr.Flush(~op.gprWillBeWritten & op.regsOut, RegCache::FlushMode::Undirty); + fpr.Flush(~op.fprWillBeWritten & op.GetFregsOut(), RegCache::FlushMode::Undirty); if (opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index f3c329cef4..70f61d8b9a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -194,7 +194,10 @@ void Jit64::ComputeRC(preg_t preg, bool needs_test, bool needs_sext) // We don't want to do this if a test is needed though, because it would interrupt macro-op // fusion. arg.Unlock(); - gpr.Flush(~js.op->gprInUse); + gpr.Flush(~(js.op->gprWillBeRead | js.op->gprWillBeWritten) & + (js.op->regsIn | js.op->regsOut), + RegCache::FlushMode::Full); + gpr.Flush(~js.op->gprWillBeWritten & js.op->regsOut, RegCache::FlushMode::Undirty); } DoMergedBranchCondition(); } diff --git a/Source/Core/Core/PowerPC/Jit64/RegCache/GPRRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/RegCache/GPRRegCache.cpp index a740d76e3d..fdbe13e192 100644 --- a/Source/Core/Core/PowerPC/Jit64/RegCache/GPRRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/RegCache/GPRRegCache.cpp @@ -115,7 +115,7 @@ void GPRRegCache::SetImmediate32(preg_t preg, u32 imm_value, bool dirty) BitSet32 GPRRegCache::GetRegUtilization() const { - return m_jit.js.op->gprInUse; + return m_jit.js.op->gprWillBeRead | m_jit.js.op->gprWillBeWritten; } BitSet32 GPRRegCache::CountRegsIn(preg_t preg, u32 lookahead) const diff --git a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp index 55c3a3acea..592813011d 100644 --- a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.cpp @@ -330,7 +330,8 @@ void RegCache::Discard(BitSet32 pregs) } } -void RegCache::Flush(BitSet32 pregs, IgnoreDiscardedRegisters ignore_discarded_registers) +void RegCache::Flush(BitSet32 pregs, FlushMode mode, + IgnoreDiscardedRegisters ignore_discarded_registers) { ASSERT_MSG(DYNA_REC, std::ranges::none_of(m_xregs, &X64CachedReg::IsLocked), "Someone forgot to unlock a X64 reg"); @@ -342,7 +343,7 @@ void RegCache::Flush(BitSet32 pregs, IgnoreDiscardedRegisters ignore_discarded_r ASSERT_MSG(DYNA_REC, !m_regs[i].IsRevertable(), "Register transaction is in progress for {}!", i); - StoreFromRegister(i, FlushMode::Full, ignore_discarded_registers); + StoreFromRegister(i, mode, ignore_discarded_registers); } } @@ -615,7 +616,7 @@ void RegCache::Realize(preg_t preg) if (m_constraints[preg].ShouldBeRevertable()) { - StoreFromRegister(preg, FlushMode::MaintainState); + StoreFromRegister(preg, FlushMode::Undirty); do_bind(); m_regs[preg].SetRevertable(); return; diff --git a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h index 41bdeb08c3..7a692a2b63 100644 --- a/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/RegCache/JitRegCache.h @@ -121,8 +121,11 @@ class RegCache public: enum class FlushMode { + // All dirty registers get written back, and all registers get removed from the cache. Full, - MaintainState, + // All dirty registers get written back and get set as no longer dirty. + // No registers are removed from the cache. + Undirty, }; enum class IgnoreDiscardedRegisters @@ -175,7 +178,7 @@ public: RCForkGuard Fork(); void Discard(BitSet32 pregs); - void Flush(BitSet32 pregs = BitSet32::AllTrue(32), + void Flush(BitSet32 pregs = BitSet32::AllTrue(32), FlushMode mode = FlushMode::Full, IgnoreDiscardedRegisters ignore_discarded_registers = IgnoreDiscardedRegisters::No); void Reset(BitSet32 pregs); void Revert(); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index ce5dd96c74..ae656ec4d2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -257,8 +257,8 @@ void JitArm64::Shutdown() void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) { FlushCarry(); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes); if (js.op->canEndBlock) { @@ -322,8 +322,8 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) void JitArm64::HLEFunction(u32 hook_index) { FlushCarry(); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); ABI_CallFunction(&HLE::ExecuteFromJIT, js.compilerPC, hook_index, &m_system); } @@ -456,8 +456,8 @@ void JitArm64::MSRUpdated(u32 msr) // Call PageTableUpdatedFromJit if needed if (UReg_MSR(msr).DR) { - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); auto WA = gpr.GetScopedReg(); @@ -497,8 +497,8 @@ void JitArm64::MSRUpdated(ARM64Reg msr) // Call PageTableUpdatedFromJit if needed MOV(WA, msr); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); FixupBranch dr_unset = TBZ(WA, dr_bit); static_assert(PPCSTATE_OFF(pagetable_update_pending) < 0x1000); LDRB(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(pagetable_update_pending)); @@ -1311,8 +1311,8 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) m_system.GetPowerPC().GetBreakPoints().IsAddressBreakPoint(op.address)) { FlushCarry(); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); static_assert(PPCSTATE_OFF(pc) <= 252); static_assert(PPCSTATE_OFF(pc) + 4 == PPCSTATE_OFF(npc)); @@ -1371,8 +1371,8 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) if (bJITRegisterCacheOff) { FlushCarry(); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); m_constant_propagation.Clear(); CompileInstruction(op); @@ -1418,9 +1418,16 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) fpr.DiscardRegisters(op.fprDiscardable); gpr.DiscardCRRegisters(op.crDiscardable); } - gpr.StoreRegisters(~op.gprInUse & (op.regsIn | op.regsOut)); - fpr.StoreRegisters(~op.fprInUse & (op.fregsIn | op.GetFregsOut())); - gpr.StoreCRRegisters(~op.crInUse & (op.crIn | op.crOut)); + gpr.FlushRegisters(~(op.gprWillBeRead | op.gprWillBeWritten) & (op.regsIn | op.regsOut), + FlushMode::Full); + fpr.FlushRegisters(~(op.fprWillBeRead | op.fprWillBeWritten) & + (op.fregsIn | op.GetFregsOut()), + FlushMode::Full); + gpr.FlushCRRegisters(~(op.crWillBeRead | op.crWillBeWritten) & (op.crIn | op.crOut), + FlushMode::Full); + gpr.FlushRegisters(~op.gprWillBeWritten & op.regsOut, FlushMode::Undirty); + fpr.FlushRegisters(~op.fprWillBeWritten & op.GetFregsOut(), FlushMode::Undirty); + gpr.FlushCRRegisters(~op.crWillBeWritten & op.crOut, FlushMode::Undirty); if (opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; @@ -1435,8 +1442,8 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) if (code_block.m_broken) { - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); WriteExit(nextPC); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 0c11dcbc3a..8368a7cc91 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -322,8 +322,8 @@ void JitArm64::FlushPPCStateBeforeSlowAccess(ARM64Reg temp_gpr, ARM64Reg temp_fp MemChecks& mem_checks = m_system.GetPowerPC().GetMemChecks(); if (mem_checks.HasAny()) { - gpr.StoreRegisters(mem_checks.GetGPRsUsedInConditions(), temp_gpr, FlushMode::MaintainState); - fpr.StoreRegisters(mem_checks.GetFPRsUsedInConditions(), temp_fpr, FlushMode::MaintainState); + gpr.FlushRegisters(mem_checks.GetGPRsUsedInConditions(), FlushMode::MaintainState, temp_gpr); + fpr.FlushRegisters(mem_checks.GetFPRsUsedInConditions(), FlushMode::MaintainState, temp_fpr); } } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp index 191c423876..8f391538c5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp @@ -21,8 +21,8 @@ void JitArm64::sc(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITBranchOff); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); { auto WA = gpr.GetScopedReg(); @@ -39,8 +39,8 @@ void JitArm64::rfi(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITBranchOff); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); // See Interpreter rfi for details const u32 mask = 0x87C0FFFF; @@ -140,8 +140,8 @@ void JitArm64::bx(UGeckoInstruction inst) return; } - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); if (js.op->branchIsIdleLoop) { @@ -243,8 +243,8 @@ void JitArm64::bcx(UGeckoInstruction inst) if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) { - gpr.Flush(FlushMode::All, WA); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, WA); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); if (IsBranchWatchEnabled()) { WriteBranchWatch(js.compilerPC, js.compilerPC + 4, inst, {}, {}); @@ -275,8 +275,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst) // BO_2 == 1z1zz -> b always // NPC = CTR & 0xfffffffc; - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); Arm64GPRCache::ScopedARM64Reg WB = ARM64Reg::INVALID_REG; if (inst.LK_3) @@ -345,8 +345,8 @@ void JitArm64::bclrx(UGeckoInstruction inst) STR(IndexType::Unsigned, WB, PPC_REG, PPCSTATE_OFF_SPR(SPR_LR)); } - gpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::All, WB); - fpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::Full, WB); + fpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::Full, ARM64Reg::INVALID_REG); if (IsBranchWatchEnabled()) { @@ -390,8 +390,8 @@ void JitArm64::bclrx(UGeckoInstruction inst) if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) { - gpr.Flush(FlushMode::All, WA); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, WA); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); if (IsBranchWatchEnabled()) { WriteBranchWatch(js.compilerPC, js.compilerPC + 4, inst, {}, {}); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 8c80151e91..13e98aae79 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -559,26 +559,11 @@ void JitArm64::lmw(UGeckoInstruction inst) } } - BitSet32 gprs_to_flush = ~js.op->gprInUse & BitSet32(0xFFFFFFFFU << d); - if (!js.op->gprInUse[a]) - { - if (!a_is_addr_base_reg) - { - gprs_to_flush[a] = true; - } - else - { - gprs_to_flush[a] = false; + BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten & BitSet32(0xFFFFFFFFU << d); - if (a + 1 == d && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0) - { - // In this situation, we can save one store instruction by flushing GPR d together with GPR - // a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's - // also wait with flushing GPR d until the end of the PPC instruction. - gprs_to_flush[d] = false; - } - } - } + BitSet32 gprs_to_flush = ~(js.op->gprWillBeWritten | js.op->gprWillBeRead); + if (a_is_addr_base_reg) + gprs_to_flush[a] = false; // TODO: This doesn't handle rollback on DSI correctly constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32; @@ -614,18 +599,20 @@ void JitArm64::lmw(UGeckoInstruction inst) { gpr.DiscardRegisters(BitSet32{int(i)}); } - else if (gprs_to_flush[i]) + else if (gprs_to_undirty[i]) { - BitSet32 gprs_to_flush_this_time{}; - if (i != 0 && gprs_to_flush[i - 1]) - gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)}; - else if (i == 31 || !gprs_to_flush[i + 1]) - gprs_to_flush_this_time = BitSet32{int(i)}; + BitSet32 gprs_to_undirty_this_time{}; + if (i != 0 && gprs_to_undirty[i - 1]) + gprs_to_undirty_this_time = BitSet32{int(i - 1), int(i)}; + else if (i == 31 || !gprs_to_undirty[i + 1]) + gprs_to_undirty_this_time = BitSet32{int(i)}; else continue; - gpr.StoreRegisters(gprs_to_flush_this_time); - gprs_to_flush &= ~gprs_to_flush_this_time; + gpr.FlushRegisters(gprs_to_undirty_this_time, FlushMode::Undirty, ARM64Reg::INVALID_REG); + gpr.FlushRegisters(gprs_to_undirty_this_time & gprs_to_flush, FlushMode::Full, + ARM64Reg::INVALID_REG); + gprs_to_undirty &= ~gprs_to_undirty_this_time; } } @@ -677,27 +664,7 @@ void JitArm64::stmw(UGeckoInstruction inst) } } - const BitSet32 dirty_gprs_to_flush_unmasked = ~js.op->gprInUse & gpr.GetDirtyGPRs(); - BitSet32 dirty_gprs_to_flush = dirty_gprs_to_flush_unmasked & BitSet32(0xFFFFFFFFU << s); - if (dirty_gprs_to_flush_unmasked[a]) - { - if (!a_is_addr_base_reg) - { - dirty_gprs_to_flush[a] = true; - } - else - { - dirty_gprs_to_flush[a] = false; - - if (a + 1 == s && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0) - { - // In this situation, we can save one store instruction by flushing GPR s together with GPR - // a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's - // also wait with flushing GPR s until the end of the PPC instruction. - dirty_gprs_to_flush[s] = false; - } - } - } + const BitSet32 gprs_to_flush = ~(js.op->gprWillBeRead | js.op->gprWillBeWritten); // TODO: This doesn't handle rollback on DSI correctly constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32; @@ -720,34 +687,12 @@ void JitArm64::stmw(UGeckoInstruction inst) EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use, fprs_in_use); - // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores - // after this instruction, flush registers that would be flushed after this instruction anyway. - // - // We try to store two registers at a time when possible to let the register cache use STP. + // To reduce register pressure, flush registers that would be flushed after this instruction + // anyway. if (gprs_to_discard[i]) - { gpr.DiscardRegisters(BitSet32{int(i)}); - } - else if (dirty_gprs_to_flush[i]) - { - BitSet32 gprs_to_flush_this_time{}; - if (i != 0 && dirty_gprs_to_flush[i - 1]) - gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)}; - else if (i == 31 || !dirty_gprs_to_flush[i + 1]) - gprs_to_flush_this_time = BitSet32{int(i)}; - else - continue; - - gpr.StoreRegisters(gprs_to_flush_this_time); - dirty_gprs_to_flush &= ~gprs_to_flush_this_time; - } - else if (!js.op->gprInUse[i]) - { - // If this register can be flushed but it isn't dirty, no store instruction will be emitted - // when flushing it, so it doesn't matter if we flush it together with another register or - // not. Let's just flush it in the simplest way possible. - gpr.StoreRegisters(BitSet32{int(i)}); - } + else if (gprs_to_flush[i]) + gpr.FlushRegisters(BitSet32{int(i)}, FlushMode::Full, ARM64Reg::INVALID_REG); } gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 308c14fdde..de967cf85a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -121,7 +121,7 @@ void Arm64RegCache::FlushMostStaleRegister() } } - FlushRegister(most_stale_preg, FlushMode::All, ARM64Reg::INVALID_REG); + FlushRegister(most_stale_preg, FlushMode::Full, ARM64Reg::INVALID_REG); } void Arm64RegCache::DiscardRegister(size_t preg) @@ -203,11 +203,15 @@ void Arm64GPRCache::FlushRegister(size_t index, FlushMode mode, ARM64Reg tmp_reg if (!reg.IsInPPCState()) m_emit->STR(IndexType::Unsigned, host_reg, PPC_REG, u32(guest_reg.ppc_offset)); - if (mode == FlushMode::All) + if (mode == FlushMode::Full) { UnlockRegister(EncodeRegTo32(host_reg)); reg.Flush(); } + else if (mode == FlushMode::Undirty) + { + reg.SetDirty(false); + } } else if (is_gpr && IsImm(index - GUEST_GPR_OFFSET)) { @@ -244,8 +248,10 @@ void Arm64GPRCache::FlushRegister(size_t index, FlushMode mode, ARM64Reg tmp_reg } } - if (mode == FlushMode::All) + if (mode == FlushMode::Full) reg.Flush(); + else if (mode == FlushMode::Undirty) + reg.SetDirty(false); } } @@ -270,10 +276,10 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, FlushMode mode, ARM64Reg tmp_r const bool reg2_imm = IsImm(i + 1); const bool reg1_zero = reg1_imm && GetImm(i) == 0; const bool reg2_zero = reg2_imm && GetImm(i + 1) == 0; - const bool flush_all = mode == FlushMode::All; + const bool can_allocate_reg = mode != FlushMode::MaintainState; if (!reg1.IsInPPCState() && !reg2.IsInPPCState() && - (reg1.IsInHostRegister() || (reg1_imm && (reg1_zero || flush_all))) && - (reg2.IsInHostRegister() || (reg2_imm && (reg2_zero || flush_all)))) + (reg1.IsInHostRegister() || (reg1_imm && (reg1_zero || can_allocate_reg))) && + (reg2.IsInHostRegister() || (reg2_imm && (reg2_zero || can_allocate_reg)))) { const size_t ppc_offset = GetGuestByIndex(i).ppc_offset; if (ppc_offset <= 252) @@ -281,7 +287,7 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, FlushMode mode, ARM64Reg tmp_r ARM64Reg RX1 = reg1_zero ? ARM64Reg::WZR : BindForRead(i); ARM64Reg RX2 = reg2_zero ? ARM64Reg::WZR : BindForRead(i + 1); m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset)); - if (flush_all) + if (mode == FlushMode::Full) { if (reg1.IsInHostRegister()) UnlockRegister(reg1.GetReg()); @@ -290,6 +296,11 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, FlushMode mode, ARM64Reg tmp_r reg1.Flush(); reg2.Flush(); } + else if (mode == FlushMode::Undirty) + { + reg1.SetDirty(false); + reg2.SetDirty(false); + } ++iter; continue; } @@ -497,7 +508,7 @@ void Arm64GPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg) const OpArg& reg = m_guest_registers[i]; if (reg.IsInHostRegister() && DecodeReg(reg.GetReg()) == DecodeReg(host_reg)) { - FlushRegister(i, FlushMode::All, tmp_reg); + FlushRegister(i, FlushMode::Full, tmp_reg); return; } } @@ -788,7 +799,7 @@ void Arm64FPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg) if (reg.IsInHostRegister() && reg.GetReg() == host_reg) { - FlushRegister(i, FlushMode::All, tmp_reg); + FlushRegister(i, FlushMode::Full, tmp_reg); return; } } @@ -817,6 +828,22 @@ void Arm64FPRCache::FlushRegister(size_t preg, FlushMode mode, ARM64Reg tmp_reg) const bool dirty = !reg.IsInPPCState(); RegType type = reg.GetFPRType(); + if (mode == FlushMode::Undirty) + { + switch (type) + { + case RegType::Single: + case RegType::DuplicatedSingle: + case RegType::LowerPairSingle: + // In this situation, skip flushing. It's usually better to wait until later instead to avoid + // extra conversions. We can revisit this decision in the future if the register cache gets + // the ability to store both the single and double versions of a value simultaneously. + return; + default: + break; + } + } + bool allocated_tmp_reg = false; if (tmp_reg != ARM64Reg::INVALID_REG) { @@ -868,11 +895,15 @@ void Arm64FPRCache::FlushRegister(size_t preg, FlushMode mode, ARM64Reg tmp_reg) static_cast(PPCSTATE_OFF_PS0(preg))); } - if (mode == FlushMode::All) + if (mode == FlushMode::Full) { UnlockRegister(host_reg); reg.Flush(); } + else if (mode == FlushMode::Undirty) + { + reg.SetDirty(false); + } } else if (type == RegType::Duplicated) { @@ -892,11 +923,15 @@ void Arm64FPRCache::FlushRegister(size_t preg, FlushMode mode, ARM64Reg tmp_reg) } } - if (mode == FlushMode::All) + if (mode == FlushMode::Full) { UnlockRegister(host_reg); reg.Flush(); } + else if (mode == FlushMode::Undirty) + { + reg.SetDirty(false); + } } if (allocated_tmp_reg) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index cde4fbe6cb..7f16630acd 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -68,13 +68,17 @@ enum class RegType DuplicatedSingle, // PS0 and PS1 are identical, host register only stores one lane (32-bit) }; -enum class FlushMode : bool +enum class FlushMode { - // Flushes all registers, no exceptions - All, - // Flushes registers in a conditional branch - // Doesn't wipe the state of the registers from the cache + // All dirty registers get written back, and all registers get removed from the cache. + Full, + // All dirty registers get written back, but the state of the cache is untouched. + // The host registers may get clobbered. This is intended for use when doing a block exit + // after a conditional branch. MaintainState, + // Most dirty registers get written back and get set as no longer dirty. + // No registers are removed from the cache. + Undirty, }; enum class IgnoreDiscardedRegisters @@ -379,17 +383,15 @@ public: BitSet32 GetDirtyGPRs() const; - void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, - FlushMode flush_mode = FlushMode::All) - { - FlushRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No); - } + void FlushRegisters( + BitSet32 regs, FlushMode flush_mode = FlushMode::Full, + Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, + IgnoreDiscardedRegisters ignore_discarded_registers = IgnoreDiscardedRegisters::No); - void StoreCRRegisters(BitSet8 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, - FlushMode flush_mode = FlushMode::All) - { - FlushCRRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No); - } + void FlushCRRegisters( + BitSet8 regs, FlushMode flush_mode = FlushMode::Full, + Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, + IgnoreDiscardedRegisters ignore_discarded_registers = IgnoreDiscardedRegisters::No); void DiscardCRRegisters(BitSet8 regs); void ResetCRRegisters(BitSet8 regs); @@ -436,11 +438,6 @@ private: void SetImmediateInternal(size_t index, u32 imm, bool dirty); void BindForWrite(size_t index, bool will_read, bool will_write = true); - void FlushRegisters(BitSet32 regs, FlushMode mode, Arm64Gen::ARM64Reg tmp_reg, - IgnoreDiscardedRegisters ignore_discarded_registers); - void FlushCRRegisters(BitSet8 regs, FlushMode mode, Arm64Gen::ARM64Reg tmp_reg, - IgnoreDiscardedRegisters ignore_discarded_registers); - static constexpr size_t GUEST_GPR_COUNT = 32; static constexpr size_t GUEST_CR_COUNT = 8; static constexpr size_t GUEST_GPR_OFFSET = 0; @@ -470,11 +467,8 @@ public: void FixSinglePrecision(size_t preg); - void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG, - FlushMode flush_mode = FlushMode::All) - { - FlushRegisters(regs, flush_mode, tmp_reg); - } + void FlushRegisters(BitSet32 regs, FlushMode flush_mode = FlushMode::Full, + Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG); protected: // Get the order of the host registers @@ -489,6 +483,4 @@ protected: private: bool IsCallerSaved(Arm64Gen::ARM64Reg reg) const; bool IsTopHalfUsed(Arm64Gen::ARM64Reg reg) const; - - void FlushRegisters(BitSet32 regs, FlushMode mode, Arm64Gen::ARM64Reg tmp_reg); }; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp index 21a0003887..30f80f4dff 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp @@ -227,8 +227,8 @@ void JitArm64::mtmsr(UGeckoInstruction inst) if (!imm_value) MSRUpdated(gpr.R(inst.RS)); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); WriteExceptionExit(js.compilerPC + 4, true); } @@ -367,8 +367,8 @@ void JitArm64::twx(UGeckoInstruction inst) if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) { - gpr.Flush(FlushMode::All, WA); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + gpr.Flush(FlushMode::Full, WA); + fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG); WriteExit(js.compilerPC + 4); } } @@ -714,15 +714,12 @@ void JitArm64::mfcr(UGeckoInstruction inst) CMP(CR, ARM64Reg::ZR); CSEL(WA, WC, WA, CC_GT); - // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores - // after this instruction, flush registers that would be flushed after this instruction anyway. - // - // There's no point in ensuring we flush two registers at the same time, because the offset in - // ppcState for CRs is too large to be encoded into an STP instruction. + // To reduce register pressure, flush registers that would be flushed after this instruction + // anyway. if (js.op->crDiscardable[i]) gpr.DiscardCRRegisters(BitSet8{i}); - else if (!js.op->crInUse[i]) - gpr.StoreCRRegisters(BitSet8{i}, WC); + else if (!(js.op->crWillBeRead | js.op->crWillBeWritten)[i]) + gpr.FlushCRRegisters(BitSet8{i}, FlushMode::Full, WC); } } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 40b974b0d8..6ecb9c3262 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -984,8 +984,9 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, // wants flags, to be safe. bool wantsFPRF = true; bool wantsCA = true; - BitSet8 crInUse, crDiscardable; - BitSet32 gprBlockInputs, gprInUse, fprInUse, gprDiscardable, fprDiscardable, fprInXmm; + BitSet8 crWillBeRead, crWillBeWritten, crDiscardable; + BitSet32 gprWillBeRead, gprWillBeWritten, fprWillBeRead, fprWillBeWritten, gprDiscardable, + fprDiscardable, fprInXmm; for (int i = block->m_num_instructions - 1; i >= 0; i--) { CodeOp& op = code[i]; @@ -1012,28 +1013,38 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, wantsCA |= opWantsCA || may_exit_block; wantsFPRF &= !op.outputFPRF || opWantsFPRF; wantsCA &= !op.outputCA || opWantsCA; - op.gprInUse = gprInUse; - op.fprInUse = fprInUse; - op.crInUse = crInUse; + op.gprWillBeRead = gprWillBeRead; + op.gprWillBeWritten = gprWillBeWritten; + op.fprWillBeRead = fprWillBeRead; + op.fprWillBeWritten = fprWillBeWritten; + op.crWillBeRead = crWillBeRead; + op.crWillBeWritten = crWillBeWritten; op.gprDiscardable = gprDiscardable; op.fprDiscardable = fprDiscardable; op.crDiscardable = crDiscardable; op.fprInXmm = fprInXmm; - gprBlockInputs &= ~op.regsOut; - gprBlockInputs |= op.regsIn; - gprInUse |= op.regsIn | op.regsOut; - fprInUse |= op.fregsIn | op.GetFregsOut(); - crInUse |= op.crIn | op.crOut; + gprWillBeRead &= ~op.regsOut; + gprWillBeRead |= op.regsIn; + gprWillBeWritten |= op.regsOut; + fprWillBeRead &= ~op.GetFregsOut(); + fprWillBeRead |= op.fregsIn; + fprWillBeWritten |= op.GetFregsOut(); + crWillBeRead &= ~op.crOut; + crWillBeRead |= op.crIn; + crWillBeWritten |= op.crOut; if (strncmp(op.opinfo->opname, "stfd", 4)) fprInXmm |= op.fregsIn; if (hle || breakpoint) { - gprInUse = BitSet32{}; - fprInUse = BitSet32{}; + gprWillBeRead = BitSet32{}; + gprWillBeWritten = BitSet32{}; + fprWillBeRead = BitSet32{}; + fprWillBeWritten = BitSet32{}; fprInXmm = BitSet32{}; - crInUse = BitSet8{}; + crWillBeRead = BitSet8{}; + crWillBeWritten = BitSet8{}; gprDiscardable = BitSet32{}; fprDiscardable = BitSet32{}; crDiscardable = BitSet8{}; @@ -1149,7 +1160,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, } block->m_gqr_used = gqrUsed; block->m_gqr_modified = gqrModified; - block->m_gpr_inputs = gprBlockInputs; + block->m_gpr_inputs = gprWillBeRead; return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 879bd06111..f9081bae93 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -51,11 +51,14 @@ struct CodeOp // 16B bool canCauseException = false; bool skipLRStack = false; bool skip = false; // followed BL-s for example - BitSet8 crInUse; + BitSet8 crWillBeRead; + BitSet8 crWillBeWritten; BitSet8 crDiscardable; // which registers are still needed after this instruction in this block - BitSet32 fprInUse; - BitSet32 gprInUse; + BitSet32 gprWillBeRead; + BitSet32 gprWillBeWritten; + BitSet32 fprWillBeRead; + BitSet32 fprWillBeWritten; // which registers have values which are known to be unused after this instruction BitSet32 gprDiscardable; BitSet32 fprDiscardable;