PPCAnalyst: Split "in use" analysis into reads and writes

If the last write to a register comes before the last read of it,
we can write the register to ppcState after the last write instead of
after the last read. This will hopefully help spread out m_ppc_state
writes across a code block, improving pipelining. Also, if there's a
conditional branch that's after the last write but before the last read,
then instead of needing to emit an m_ppc_state write on each side of the
branch (two writes in total), we now only need to emit a single
m_ppc_state write.

A note about the changes made to stmw and mfcr: These instructions don't
write to any GPRs (stmw) or CRs (mfcr) — they only read from them.
With this commit, there are no longer any cases where registers get
written back to m_ppc_state after an instruction that just reads from
them, so we can get rid of all STP logic from these two instructions.
lmw still needs its STP logic, since that one does write to registers.
This commit is contained in:
JosJuice 2023-11-04 17:48:57 +01:00
parent ffa03fec78
commit 2fe7e109aa
14 changed files with 189 additions and 185 deletions

View File

@ -351,8 +351,10 @@ void Jit64::Shutdown()
void Jit64::FallBackToInterpreter(UGeckoInstruction inst)
{
FlushCarry();
gpr.Flush(BitSet32(0xFFFFFFFF), RegCache::IgnoreDiscardedRegisters::Yes);
fpr.Flush(BitSet32(0xFFFFFFFF), RegCache::IgnoreDiscardedRegisters::Yes);
gpr.Flush(BitSet32(0xFFFFFFFF), RegCache::FlushMode::Full,
RegCache::IgnoreDiscardedRegisters::Yes);
fpr.Flush(BitSet32(0xFFFFFFFF), RegCache::FlushMode::Full,
RegCache::IgnoreDiscardedRegisters::Yes);
if (js.op->canEndBlock)
{
@ -1151,7 +1153,8 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
// output, which needs to be bound in the actual instruction compilation.
// TODO: make this smarter in the case that we're actually register-starved, i.e.
// prioritize the more important registers.
gpr.PreloadRegisters(op.regsIn & op.gprInUse & ~op.gprDiscardable);
gpr.PreloadRegisters(op.regsIn & (op.gprWillBeRead | op.gprWillBeWritten) &
~op.gprDiscardable);
fpr.PreloadRegisters(op.fregsIn & op.fprInXmm & ~op.fprDiscardable);
}
@ -1230,8 +1233,12 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
gpr.Discard(op.gprDiscardable);
fpr.Discard(op.fprDiscardable);
}
gpr.Flush(~op.gprInUse & (op.regsIn | op.regsOut));
fpr.Flush(~op.fprInUse & (op.fregsIn | op.GetFregsOut()));
gpr.Flush(~(op.gprWillBeRead | op.gprWillBeWritten) & (op.regsIn | op.regsOut),
RegCache::FlushMode::Full);
fpr.Flush(~(op.fprWillBeRead | op.fprWillBeWritten) & (op.fregsIn | op.GetFregsOut()),
RegCache::FlushMode::Full);
gpr.Flush(~op.gprWillBeWritten & op.regsOut, RegCache::FlushMode::Undirty);
fpr.Flush(~op.fprWillBeWritten & op.GetFregsOut(), RegCache::FlushMode::Undirty);
if (opinfo->flags & FL_LOADSTORE)
++js.numLoadStoreInst;

View File

@ -194,7 +194,10 @@ void Jit64::ComputeRC(preg_t preg, bool needs_test, bool needs_sext)
// We don't want to do this if a test is needed though, because it would interrupt macro-op
// fusion.
arg.Unlock();
gpr.Flush(~js.op->gprInUse);
gpr.Flush(~(js.op->gprWillBeRead | js.op->gprWillBeWritten) &
(js.op->regsIn | js.op->regsOut),
RegCache::FlushMode::Full);
gpr.Flush(~js.op->gprWillBeWritten & js.op->regsOut, RegCache::FlushMode::Undirty);
}
DoMergedBranchCondition();
}

View File

@ -115,7 +115,7 @@ void GPRRegCache::SetImmediate32(preg_t preg, u32 imm_value, bool dirty)
BitSet32 GPRRegCache::GetRegUtilization() const
{
return m_jit.js.op->gprInUse;
return m_jit.js.op->gprWillBeRead | m_jit.js.op->gprWillBeWritten;
}
BitSet32 GPRRegCache::CountRegsIn(preg_t preg, u32 lookahead) const

View File

@ -330,7 +330,8 @@ void RegCache::Discard(BitSet32 pregs)
}
}
void RegCache::Flush(BitSet32 pregs, IgnoreDiscardedRegisters ignore_discarded_registers)
void RegCache::Flush(BitSet32 pregs, FlushMode mode,
IgnoreDiscardedRegisters ignore_discarded_registers)
{
ASSERT_MSG(DYNA_REC, std::ranges::none_of(m_xregs, &X64CachedReg::IsLocked),
"Someone forgot to unlock a X64 reg");
@ -342,7 +343,7 @@ void RegCache::Flush(BitSet32 pregs, IgnoreDiscardedRegisters ignore_discarded_r
ASSERT_MSG(DYNA_REC, !m_regs[i].IsRevertable(), "Register transaction is in progress for {}!",
i);
StoreFromRegister(i, FlushMode::Full, ignore_discarded_registers);
StoreFromRegister(i, mode, ignore_discarded_registers);
}
}
@ -615,7 +616,7 @@ void RegCache::Realize(preg_t preg)
if (m_constraints[preg].ShouldBeRevertable())
{
StoreFromRegister(preg, FlushMode::MaintainState);
StoreFromRegister(preg, FlushMode::Undirty);
do_bind();
m_regs[preg].SetRevertable();
return;

View File

@ -121,8 +121,11 @@ class RegCache
public:
enum class FlushMode
{
// All dirty registers get written back, and all registers get removed from the cache.
Full,
MaintainState,
// All dirty registers get written back and get set as no longer dirty.
// No registers are removed from the cache.
Undirty,
};
enum class IgnoreDiscardedRegisters
@ -175,7 +178,7 @@ public:
RCForkGuard Fork();
void Discard(BitSet32 pregs);
void Flush(BitSet32 pregs = BitSet32::AllTrue(32),
void Flush(BitSet32 pregs = BitSet32::AllTrue(32), FlushMode mode = FlushMode::Full,
IgnoreDiscardedRegisters ignore_discarded_registers = IgnoreDiscardedRegisters::No);
void Reset(BitSet32 pregs);
void Revert();

View File

@ -257,8 +257,8 @@ void JitArm64::Shutdown()
void JitArm64::FallBackToInterpreter(UGeckoInstruction inst)
{
FlushCarry();
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG, IgnoreDiscardedRegisters::Yes);
if (js.op->canEndBlock)
{
@ -322,8 +322,8 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst)
void JitArm64::HLEFunction(u32 hook_index)
{
FlushCarry();
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
ABI_CallFunction(&HLE::ExecuteFromJIT, js.compilerPC, hook_index, &m_system);
}
@ -456,8 +456,8 @@ void JitArm64::MSRUpdated(u32 msr)
// Call PageTableUpdatedFromJit if needed
if (UReg_MSR(msr).DR)
{
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
auto WA = gpr.GetScopedReg();
@ -497,8 +497,8 @@ void JitArm64::MSRUpdated(ARM64Reg msr)
// Call PageTableUpdatedFromJit if needed
MOV(WA, msr);
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
FixupBranch dr_unset = TBZ(WA, dr_bit);
static_assert(PPCSTATE_OFF(pagetable_update_pending) < 0x1000);
LDRB(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(pagetable_update_pending));
@ -1311,8 +1311,8 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
m_system.GetPowerPC().GetBreakPoints().IsAddressBreakPoint(op.address))
{
FlushCarry();
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
static_assert(PPCSTATE_OFF(pc) <= 252);
static_assert(PPCSTATE_OFF(pc) + 4 == PPCSTATE_OFF(npc));
@ -1371,8 +1371,8 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
if (bJITRegisterCacheOff)
{
FlushCarry();
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
m_constant_propagation.Clear();
CompileInstruction(op);
@ -1418,9 +1418,16 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
fpr.DiscardRegisters(op.fprDiscardable);
gpr.DiscardCRRegisters(op.crDiscardable);
}
gpr.StoreRegisters(~op.gprInUse & (op.regsIn | op.regsOut));
fpr.StoreRegisters(~op.fprInUse & (op.fregsIn | op.GetFregsOut()));
gpr.StoreCRRegisters(~op.crInUse & (op.crIn | op.crOut));
gpr.FlushRegisters(~(op.gprWillBeRead | op.gprWillBeWritten) & (op.regsIn | op.regsOut),
FlushMode::Full);
fpr.FlushRegisters(~(op.fprWillBeRead | op.fprWillBeWritten) &
(op.fregsIn | op.GetFregsOut()),
FlushMode::Full);
gpr.FlushCRRegisters(~(op.crWillBeRead | op.crWillBeWritten) & (op.crIn | op.crOut),
FlushMode::Full);
gpr.FlushRegisters(~op.gprWillBeWritten & op.regsOut, FlushMode::Undirty);
fpr.FlushRegisters(~op.fprWillBeWritten & op.GetFregsOut(), FlushMode::Undirty);
gpr.FlushCRRegisters(~op.crWillBeWritten & op.crOut, FlushMode::Undirty);
if (opinfo->flags & FL_LOADSTORE)
++js.numLoadStoreInst;
@ -1435,8 +1442,8 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
if (code_block.m_broken)
{
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
WriteExit(nextPC);
}

View File

@ -322,8 +322,8 @@ void JitArm64::FlushPPCStateBeforeSlowAccess(ARM64Reg temp_gpr, ARM64Reg temp_fp
MemChecks& mem_checks = m_system.GetPowerPC().GetMemChecks();
if (mem_checks.HasAny())
{
gpr.StoreRegisters(mem_checks.GetGPRsUsedInConditions(), temp_gpr, FlushMode::MaintainState);
fpr.StoreRegisters(mem_checks.GetFPRsUsedInConditions(), temp_fpr, FlushMode::MaintainState);
gpr.FlushRegisters(mem_checks.GetGPRsUsedInConditions(), FlushMode::MaintainState, temp_gpr);
fpr.FlushRegisters(mem_checks.GetFPRsUsedInConditions(), FlushMode::MaintainState, temp_fpr);
}
}

View File

@ -21,8 +21,8 @@ void JitArm64::sc(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITBranchOff);
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
{
auto WA = gpr.GetScopedReg();
@ -39,8 +39,8 @@ void JitArm64::rfi(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITBranchOff);
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
// See Interpreter rfi for details
const u32 mask = 0x87C0FFFF;
@ -140,8 +140,8 @@ void JitArm64::bx(UGeckoInstruction inst)
return;
}
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
if (js.op->branchIsIdleLoop)
{
@ -243,8 +243,8 @@ void JitArm64::bcx(UGeckoInstruction inst)
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
{
gpr.Flush(FlushMode::All, WA);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, WA);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
if (IsBranchWatchEnabled())
{
WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, {}, {});
@ -275,8 +275,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
// BO_2 == 1z1zz -> b always
// NPC = CTR & 0xfffffffc;
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
Arm64GPRCache::ScopedARM64Reg WB = ARM64Reg::INVALID_REG;
if (inst.LK_3)
@ -345,8 +345,8 @@ void JitArm64::bclrx(UGeckoInstruction inst)
STR(IndexType::Unsigned, WB, PPC_REG, PPCSTATE_OFF_SPR(SPR_LR));
}
gpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::All, WB);
fpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::Full, WB);
fpr.Flush(conditional ? FlushMode::MaintainState : FlushMode::Full, ARM64Reg::INVALID_REG);
if (IsBranchWatchEnabled())
{
@ -390,8 +390,8 @@ void JitArm64::bclrx(UGeckoInstruction inst)
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
{
gpr.Flush(FlushMode::All, WA);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, WA);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
if (IsBranchWatchEnabled())
{
WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, {}, {});

View File

@ -559,26 +559,11 @@ void JitArm64::lmw(UGeckoInstruction inst)
}
}
BitSet32 gprs_to_flush = ~js.op->gprInUse & BitSet32(0xFFFFFFFFU << d);
if (!js.op->gprInUse[a])
{
if (!a_is_addr_base_reg)
{
gprs_to_flush[a] = true;
}
else
{
gprs_to_flush[a] = false;
BitSet32 gprs_to_undirty = ~js.op->gprWillBeWritten & BitSet32(0xFFFFFFFFU << d);
if (a + 1 == d && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
{
// In this situation, we can save one store instruction by flushing GPR d together with GPR
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
// also wait with flushing GPR d until the end of the PPC instruction.
gprs_to_flush[d] = false;
}
}
}
BitSet32 gprs_to_flush = ~(js.op->gprWillBeWritten | js.op->gprWillBeRead);
if (a_is_addr_base_reg)
gprs_to_flush[a] = false;
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
@ -614,18 +599,20 @@ void JitArm64::lmw(UGeckoInstruction inst)
{
gpr.DiscardRegisters(BitSet32{int(i)});
}
else if (gprs_to_flush[i])
else if (gprs_to_undirty[i])
{
BitSet32 gprs_to_flush_this_time{};
if (i != 0 && gprs_to_flush[i - 1])
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !gprs_to_flush[i + 1])
gprs_to_flush_this_time = BitSet32{int(i)};
BitSet32 gprs_to_undirty_this_time{};
if (i != 0 && gprs_to_undirty[i - 1])
gprs_to_undirty_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !gprs_to_undirty[i + 1])
gprs_to_undirty_this_time = BitSet32{int(i)};
else
continue;
gpr.StoreRegisters(gprs_to_flush_this_time);
gprs_to_flush &= ~gprs_to_flush_this_time;
gpr.FlushRegisters(gprs_to_undirty_this_time, FlushMode::Undirty, ARM64Reg::INVALID_REG);
gpr.FlushRegisters(gprs_to_undirty_this_time & gprs_to_flush, FlushMode::Full,
ARM64Reg::INVALID_REG);
gprs_to_undirty &= ~gprs_to_undirty_this_time;
}
}
@ -677,27 +664,7 @@ void JitArm64::stmw(UGeckoInstruction inst)
}
}
const BitSet32 dirty_gprs_to_flush_unmasked = ~js.op->gprInUse & gpr.GetDirtyGPRs();
BitSet32 dirty_gprs_to_flush = dirty_gprs_to_flush_unmasked & BitSet32(0xFFFFFFFFU << s);
if (dirty_gprs_to_flush_unmasked[a])
{
if (!a_is_addr_base_reg)
{
dirty_gprs_to_flush[a] = true;
}
else
{
dirty_gprs_to_flush[a] = false;
if (a + 1 == s && (std::countr_one((~js.op->gprInUse).m_val >> a) & 1) == 0)
{
// In this situation, we can save one store instruction by flushing GPR s together with GPR
// a, but we shouldn't flush GPR a until the end of the PPC instruction. Therefore, let's
// also wait with flushing GPR s until the end of the PPC instruction.
dirty_gprs_to_flush[s] = false;
}
}
}
const BitSet32 gprs_to_flush = ~(js.op->gprWillBeRead | js.op->gprWillBeWritten);
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
@ -720,34 +687,12 @@ void JitArm64::stmw(UGeckoInstruction inst)
EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use,
fprs_in_use);
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// We try to store two registers at a time when possible to let the register cache use STP.
// To reduce register pressure, flush registers that would be flushed after this instruction
// anyway.
if (gprs_to_discard[i])
{
gpr.DiscardRegisters(BitSet32{int(i)});
}
else if (dirty_gprs_to_flush[i])
{
BitSet32 gprs_to_flush_this_time{};
if (i != 0 && dirty_gprs_to_flush[i - 1])
gprs_to_flush_this_time = BitSet32{int(i - 1), int(i)};
else if (i == 31 || !dirty_gprs_to_flush[i + 1])
gprs_to_flush_this_time = BitSet32{int(i)};
else
continue;
gpr.StoreRegisters(gprs_to_flush_this_time);
dirty_gprs_to_flush &= ~gprs_to_flush_this_time;
}
else if (!js.op->gprInUse[i])
{
// If this register can be flushed but it isn't dirty, no store instruction will be emitted
// when flushing it, so it doesn't matter if we flush it together with another register or
// not. Let's just flush it in the simplest way possible.
gpr.StoreRegisters(BitSet32{int(i)});
}
else if (gprs_to_flush[i])
gpr.FlushRegisters(BitSet32{int(i)}, FlushMode::Full, ARM64Reg::INVALID_REG);
}
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);

View File

@ -121,7 +121,7 @@ void Arm64RegCache::FlushMostStaleRegister()
}
}
FlushRegister(most_stale_preg, FlushMode::All, ARM64Reg::INVALID_REG);
FlushRegister(most_stale_preg, FlushMode::Full, ARM64Reg::INVALID_REG);
}
void Arm64RegCache::DiscardRegister(size_t preg)
@ -203,11 +203,15 @@ void Arm64GPRCache::FlushRegister(size_t index, FlushMode mode, ARM64Reg tmp_reg
if (!reg.IsInPPCState())
m_emit->STR(IndexType::Unsigned, host_reg, PPC_REG, u32(guest_reg.ppc_offset));
if (mode == FlushMode::All)
if (mode == FlushMode::Full)
{
UnlockRegister(EncodeRegTo32(host_reg));
reg.Flush();
}
else if (mode == FlushMode::Undirty)
{
reg.SetDirty(false);
}
}
else if (is_gpr && IsImm(index - GUEST_GPR_OFFSET))
{
@ -244,8 +248,10 @@ void Arm64GPRCache::FlushRegister(size_t index, FlushMode mode, ARM64Reg tmp_reg
}
}
if (mode == FlushMode::All)
if (mode == FlushMode::Full)
reg.Flush();
else if (mode == FlushMode::Undirty)
reg.SetDirty(false);
}
}
@ -270,10 +276,10 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, FlushMode mode, ARM64Reg tmp_r
const bool reg2_imm = IsImm(i + 1);
const bool reg1_zero = reg1_imm && GetImm(i) == 0;
const bool reg2_zero = reg2_imm && GetImm(i + 1) == 0;
const bool flush_all = mode == FlushMode::All;
const bool can_allocate_reg = mode != FlushMode::MaintainState;
if (!reg1.IsInPPCState() && !reg2.IsInPPCState() &&
(reg1.IsInHostRegister() || (reg1_imm && (reg1_zero || flush_all))) &&
(reg2.IsInHostRegister() || (reg2_imm && (reg2_zero || flush_all))))
(reg1.IsInHostRegister() || (reg1_imm && (reg1_zero || can_allocate_reg))) &&
(reg2.IsInHostRegister() || (reg2_imm && (reg2_zero || can_allocate_reg))))
{
const size_t ppc_offset = GetGuestByIndex(i).ppc_offset;
if (ppc_offset <= 252)
@ -281,7 +287,7 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, FlushMode mode, ARM64Reg tmp_r
ARM64Reg RX1 = reg1_zero ? ARM64Reg::WZR : BindForRead(i);
ARM64Reg RX2 = reg2_zero ? ARM64Reg::WZR : BindForRead(i + 1);
m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset));
if (flush_all)
if (mode == FlushMode::Full)
{
if (reg1.IsInHostRegister())
UnlockRegister(reg1.GetReg());
@ -290,6 +296,11 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, FlushMode mode, ARM64Reg tmp_r
reg1.Flush();
reg2.Flush();
}
else if (mode == FlushMode::Undirty)
{
reg1.SetDirty(false);
reg2.SetDirty(false);
}
++iter;
continue;
}
@ -497,7 +508,7 @@ void Arm64GPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
const OpArg& reg = m_guest_registers[i];
if (reg.IsInHostRegister() && DecodeReg(reg.GetReg()) == DecodeReg(host_reg))
{
FlushRegister(i, FlushMode::All, tmp_reg);
FlushRegister(i, FlushMode::Full, tmp_reg);
return;
}
}
@ -788,7 +799,7 @@ void Arm64FPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
if (reg.IsInHostRegister() && reg.GetReg() == host_reg)
{
FlushRegister(i, FlushMode::All, tmp_reg);
FlushRegister(i, FlushMode::Full, tmp_reg);
return;
}
}
@ -817,6 +828,22 @@ void Arm64FPRCache::FlushRegister(size_t preg, FlushMode mode, ARM64Reg tmp_reg)
const bool dirty = !reg.IsInPPCState();
RegType type = reg.GetFPRType();
if (mode == FlushMode::Undirty)
{
switch (type)
{
case RegType::Single:
case RegType::DuplicatedSingle:
case RegType::LowerPairSingle:
// In this situation, skip flushing. It's usually better to wait until later instead to avoid
// extra conversions. We can revisit this decision in the future if the register cache gets
// the ability to store both the single and double versions of a value simultaneously.
return;
default:
break;
}
}
bool allocated_tmp_reg = false;
if (tmp_reg != ARM64Reg::INVALID_REG)
{
@ -868,11 +895,15 @@ void Arm64FPRCache::FlushRegister(size_t preg, FlushMode mode, ARM64Reg tmp_reg)
static_cast<s32>(PPCSTATE_OFF_PS0(preg)));
}
if (mode == FlushMode::All)
if (mode == FlushMode::Full)
{
UnlockRegister(host_reg);
reg.Flush();
}
else if (mode == FlushMode::Undirty)
{
reg.SetDirty(false);
}
}
else if (type == RegType::Duplicated)
{
@ -892,11 +923,15 @@ void Arm64FPRCache::FlushRegister(size_t preg, FlushMode mode, ARM64Reg tmp_reg)
}
}
if (mode == FlushMode::All)
if (mode == FlushMode::Full)
{
UnlockRegister(host_reg);
reg.Flush();
}
else if (mode == FlushMode::Undirty)
{
reg.SetDirty(false);
}
}
if (allocated_tmp_reg)

View File

@ -68,13 +68,17 @@ enum class RegType
DuplicatedSingle, // PS0 and PS1 are identical, host register only stores one lane (32-bit)
};
enum class FlushMode : bool
enum class FlushMode
{
// Flushes all registers, no exceptions
All,
// Flushes registers in a conditional branch
// Doesn't wipe the state of the registers from the cache
// All dirty registers get written back, and all registers get removed from the cache.
Full,
// All dirty registers get written back, but the state of the cache is untouched.
// The host registers may get clobbered. This is intended for use when doing a block exit
// after a conditional branch.
MaintainState,
// Most dirty registers get written back and get set as no longer dirty.
// No registers are removed from the cache.
Undirty,
};
enum class IgnoreDiscardedRegisters
@ -379,17 +383,15 @@ public:
BitSet32 GetDirtyGPRs() const;
void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
FlushMode flush_mode = FlushMode::All)
{
FlushRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No);
}
void FlushRegisters(
BitSet32 regs, FlushMode flush_mode = FlushMode::Full,
Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
IgnoreDiscardedRegisters ignore_discarded_registers = IgnoreDiscardedRegisters::No);
void StoreCRRegisters(BitSet8 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
FlushMode flush_mode = FlushMode::All)
{
FlushCRRegisters(regs, flush_mode, tmp_reg, IgnoreDiscardedRegisters::No);
}
void FlushCRRegisters(
BitSet8 regs, FlushMode flush_mode = FlushMode::Full,
Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
IgnoreDiscardedRegisters ignore_discarded_registers = IgnoreDiscardedRegisters::No);
void DiscardCRRegisters(BitSet8 regs);
void ResetCRRegisters(BitSet8 regs);
@ -436,11 +438,6 @@ private:
void SetImmediateInternal(size_t index, u32 imm, bool dirty);
void BindForWrite(size_t index, bool will_read, bool will_write = true);
void FlushRegisters(BitSet32 regs, FlushMode mode, Arm64Gen::ARM64Reg tmp_reg,
IgnoreDiscardedRegisters ignore_discarded_registers);
void FlushCRRegisters(BitSet8 regs, FlushMode mode, Arm64Gen::ARM64Reg tmp_reg,
IgnoreDiscardedRegisters ignore_discarded_registers);
static constexpr size_t GUEST_GPR_COUNT = 32;
static constexpr size_t GUEST_CR_COUNT = 8;
static constexpr size_t GUEST_GPR_OFFSET = 0;
@ -470,11 +467,8 @@ public:
void FixSinglePrecision(size_t preg);
void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG,
FlushMode flush_mode = FlushMode::All)
{
FlushRegisters(regs, flush_mode, tmp_reg);
}
void FlushRegisters(BitSet32 regs, FlushMode flush_mode = FlushMode::Full,
Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG);
protected:
// Get the order of the host registers
@ -489,6 +483,4 @@ protected:
private:
bool IsCallerSaved(Arm64Gen::ARM64Reg reg) const;
bool IsTopHalfUsed(Arm64Gen::ARM64Reg reg) const;
void FlushRegisters(BitSet32 regs, FlushMode mode, Arm64Gen::ARM64Reg tmp_reg);
};

View File

@ -227,8 +227,8 @@ void JitArm64::mtmsr(UGeckoInstruction inst)
if (!imm_value)
MSRUpdated(gpr.R(inst.RS));
gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
WriteExceptionExit(js.compilerPC + 4, true);
}
@ -367,8 +367,8 @@ void JitArm64::twx(UGeckoInstruction inst)
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
{
gpr.Flush(FlushMode::All, WA);
fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG);
gpr.Flush(FlushMode::Full, WA);
fpr.Flush(FlushMode::Full, ARM64Reg::INVALID_REG);
WriteExit(js.compilerPC + 4);
}
}
@ -714,15 +714,12 @@ void JitArm64::mfcr(UGeckoInstruction inst)
CMP(CR, ARM64Reg::ZR);
CSEL(WA, WC, WA, CC_GT);
// To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores
// after this instruction, flush registers that would be flushed after this instruction anyway.
//
// There's no point in ensuring we flush two registers at the same time, because the offset in
// ppcState for CRs is too large to be encoded into an STP instruction.
// To reduce register pressure, flush registers that would be flushed after this instruction
// anyway.
if (js.op->crDiscardable[i])
gpr.DiscardCRRegisters(BitSet8{i});
else if (!js.op->crInUse[i])
gpr.StoreCRRegisters(BitSet8{i}, WC);
else if (!(js.op->crWillBeRead | js.op->crWillBeWritten)[i])
gpr.FlushCRRegisters(BitSet8{i}, FlushMode::Full, WC);
}
}

View File

@ -983,8 +983,9 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
// wants flags, to be safe.
bool wantsFPRF = true;
bool wantsCA = true;
BitSet8 crInUse, crDiscardable;
BitSet32 gprBlockInputs, gprInUse, fprInUse, gprDiscardable, fprDiscardable, fprInXmm;
BitSet8 crWillBeRead, crWillBeWritten, crDiscardable;
BitSet32 gprWillBeRead, gprWillBeWritten, fprWillBeRead, fprWillBeWritten, gprDiscardable,
fprDiscardable, fprInXmm;
for (int i = block->m_num_instructions - 1; i >= 0; i--)
{
CodeOp& op = code[i];
@ -1011,28 +1012,38 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
wantsCA |= opWantsCA || may_exit_block;
wantsFPRF &= !op.outputFPRF || opWantsFPRF;
wantsCA &= !op.outputCA || opWantsCA;
op.gprInUse = gprInUse;
op.fprInUse = fprInUse;
op.crInUse = crInUse;
op.gprWillBeRead = gprWillBeRead;
op.gprWillBeWritten = gprWillBeWritten;
op.fprWillBeRead = fprWillBeRead;
op.fprWillBeWritten = fprWillBeWritten;
op.crWillBeRead = crWillBeRead;
op.crWillBeWritten = crWillBeWritten;
op.gprDiscardable = gprDiscardable;
op.fprDiscardable = fprDiscardable;
op.crDiscardable = crDiscardable;
op.fprInXmm = fprInXmm;
gprBlockInputs &= ~op.regsOut;
gprBlockInputs |= op.regsIn;
gprInUse |= op.regsIn | op.regsOut;
fprInUse |= op.fregsIn | op.GetFregsOut();
crInUse |= op.crIn | op.crOut;
gprWillBeRead &= ~op.regsOut;
gprWillBeRead |= op.regsIn;
gprWillBeWritten |= op.regsOut;
fprWillBeRead &= ~op.GetFregsOut();
fprWillBeRead |= op.fregsIn;
fprWillBeWritten |= op.GetFregsOut();
crWillBeRead &= ~op.crOut;
crWillBeRead |= op.crIn;
crWillBeWritten |= op.crOut;
if (strncmp(op.opinfo->opname, "stfd", 4))
fprInXmm |= op.fregsIn;
if (hle || breakpoint)
{
gprInUse = BitSet32{};
fprInUse = BitSet32{};
gprWillBeRead = BitSet32{};
gprWillBeWritten = BitSet32{};
fprWillBeRead = BitSet32{};
fprWillBeWritten = BitSet32{};
fprInXmm = BitSet32{};
crInUse = BitSet8{};
crWillBeRead = BitSet8{};
crWillBeWritten = BitSet8{};
gprDiscardable = BitSet32{};
fprDiscardable = BitSet32{};
crDiscardable = BitSet8{};
@ -1148,7 +1159,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer,
}
block->m_gqr_used = gqrUsed;
block->m_gqr_modified = gqrModified;
block->m_gpr_inputs = gprBlockInputs;
block->m_gpr_inputs = gprWillBeRead;
return address;
}

View File

@ -51,11 +51,14 @@ struct CodeOp // 16B
bool canCauseException = false;
bool skipLRStack = false;
bool skip = false; // followed BL-s for example
BitSet8 crInUse;
BitSet8 crWillBeRead;
BitSet8 crWillBeWritten;
BitSet8 crDiscardable;
// which registers are still needed after this instruction in this block
BitSet32 fprInUse;
BitSet32 gprInUse;
BitSet32 gprWillBeRead;
BitSet32 gprWillBeWritten;
BitSet32 fprWillBeRead;
BitSet32 fprWillBeWritten;
// which registers have values which are known to be unused after this instruction
BitSet32 gprDiscardable;
BitSet32 fprDiscardable;