From 272d01d197959f865c1cb90f6740f334cf44b3f2 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo <rasky@develer.com>
Date: Sun, 6 Oct 2024 23:05:39 +0200
Subject: [PATCH] n64: remove call to epilogue function from the jit

---
 ares/n64/cpu/cpu.cpp        | 27 +++++++++++++--------------
 ares/n64/cpu/cpu.hpp        | 24 ++++++++++++++++++++++--
 ares/n64/cpu/memory.cpp     | 33 +--------------------------------
 ares/n64/cpu/recompiler.cpp | 29 +++++++++++++++++++++++----
 ares/n64/cpu/tlb.cpp        | 16 ----------------
 5 files changed, 61 insertions(+), 68 deletions(-)

diff --git a/ares/n64/cpu/cpu.cpp b/ares/n64/cpu/cpu.cpp
index 29f03ce916..fd5ed0b827 100644
--- a/ares/n64/cpu/cpu.cpp
+++ b/ares/n64/cpu/cpu.cpp
@@ -114,16 +114,19 @@ auto CPU::instruction() -> void {
   if(Accuracy::CPU::Recompiler && recompiler.enabled && access.cache) {
     if(vaddrAlignedError(access.vaddr, false)) return;
     auto block = recompiler.block(ipu.pc, access.paddr, GDB::server.hasBreakpoints());
-    block->execute(*this);
-  } else {
-    auto data = fetch(access);
-    if (!data) return;
-    pipeline.begin();
-    instructionPrologue(ipu.pc, *data);
-    decoderEXECUTE(*data);
-    instructionEpilogue<0>();
-    pipeline.end();
+    if(block) {
+      block->execute(*this);
+      return;
+    }
   }
+
+  auto data = fetch(access);
+  if (!data) return;
+  pipeline.begin();
+  instructionPrologue(ipu.pc, *data);
+  decoderEXECUTE(*data);
+  instructionEpilogue<0>();
+  pipeline.end();
 }
 
 auto CPU::instructionPrologue(u64 address, u32 instruction) -> void {
@@ -132,11 +135,7 @@ auto CPU::instructionPrologue(u64 address, u32 instruction) -> void {
 }
 
 template<bool Recompiled> auto CPU::instructionEpilogue() -> void {
-  if constexpr(Recompiled) {
-    //simulates timings without performing actual icache loads
-    icache.step(ipu.pc, devirtualizeFast(ipu.pc));
-    assert(ipu.r[0].u64 == 0);
-  } else {
+  if constexpr(!Recompiled) {
     ipu.r[0].u64 = 0;
   }
 }
diff --git a/ares/n64/cpu/cpu.hpp b/ares/n64/cpu/cpu.hpp
index bddb7222a1..6025ed62d1 100644
--- a/ares/n64/cpu/cpu.hpp
+++ b/ares/n64/cpu/cpu.hpp
@@ -126,6 +126,13 @@ struct CPU : Thread {
       }
     }
 
+    auto jitFetch(u64 vaddr, u32 paddr, CPU& cpu) -> void {
+      auto& line = this->line(vaddr);
+      if(!line.hit(paddr)) {
+        line.fill(paddr, cpu);
+      }
+    }
+
     //used by the interpreter to fully emulate the instruction cache
     auto fetch(u64 vaddr, u32 paddr, CPU& cpu) -> u32 {
       auto& line = this->line(vaddr);
@@ -135,6 +142,18 @@ struct CPU : Thread {
       return line.read(paddr);
     }
 
+    auto coherent(u64 vaddr, u32 paddr) -> bool {
+      auto& line = this->line(vaddr);
+      if(!line.hit(paddr))
+        return true;
+      u32 ram[8];
+      self.busReadBurst<ICache>(paddr & ~0x0000'0fff | line.index, ram);
+      for(int i = 0; i < 8; i++)
+        if(ram[i] != line.words[i])
+          return false;
+      return true;
+    }
+
     auto power(bool reset) -> void {
       u32 index = 0;
       for(auto& line : lines) {
@@ -232,7 +251,6 @@ struct CPU : Thread {
     auto load(u64 vaddr, bool noExceptions = false) -> PhysAccess;
     auto load(u64 vaddr, const Entry& entry, bool noExceptions = false) -> maybe<PhysAccess>;
 
-    auto loadFast(u64 vaddr) -> PhysAccess;
     auto store(u64 vaddr, bool noExceptions = false) -> PhysAccess;
     auto store(u64 vaddr, const Entry& entry, bool noExceptions = false) -> maybe<PhysAccess>;
 
@@ -279,10 +297,12 @@ struct CPU : Thread {
   auto segment(u64 vaddr) -> Context::Segment;
   template<bool Write> auto devirtualize(u64 vaddr, bool raiseAlignedError = true, bool raiseExceptions = true) -> PhysAccess;
-  alwaysinline auto devirtualizeFast(u64 vaddr) -> u64;
   auto devirtualizeDebug(u64 vaddr) -> u64;
 
   auto fetch(PhysAccess access) -> maybe<u32>;
+  auto jitFetch(u64 vaddr, u32 addr) -> void {
+    icache.jitFetch(vaddr, addr, *this);
+  }
 
   template<u32 Size> auto busWrite(u32 address, u64 data) -> void;
   template<u32 Size> auto busRead(u32 address) -> u64;
   template<u32 Size> auto busWriteBurst(u32 address, u32 *data) -> void;
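The cpu.cpp hunk above is the behavioural core of the patch: recompiler.block() may now return nullptr, so the interpreter path stops being an else-branch and becomes the shared fallback. A minimal self-contained sketch of that dispatch, with hypothetical stand-ins (compileBlock, interpretOne) in place of the real ares entry points:

    #include <cstdint>
    #include <cstdio>

    //stand-in for CPU::Recompiler::Block; only the control flow matters here
    struct Block { void (*code)(); };

    //may return nullptr, e.g. when the i-cache line no longer matches RAM
    Block* compileBlock(uint64_t /*vaddr*/, uint32_t /*paddr*/) { return nullptr; }

    //prologue, decode, epilogue: one interpreted instruction
    void interpretOne(uint64_t /*vaddr*/, uint32_t paddr) {
      std::printf("interpreting %08x\n", paddr);
    }

    void cpuInstruction(uint64_t vaddr, uint32_t paddr, bool useJit) {
      if(useJit) {
        if(Block* block = compileBlock(vaddr, paddr)) {
          block->code();  //fast path: run the compiled block
          return;
        }
      }
      interpretOne(vaddr, paddr);  //fallback: interpret this instruction
    }

    int main() { cpuInstruction(0xffff'ffff'8000'0000ull, 0x0000'1000u, true); }

A refused compile therefore costs one extra null check, after which the instruction is handled exactly as in the pure-interpreter configuration.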
diff --git a/ares/n64/cpu/memory.cpp b/ares/n64/cpu/memory.cpp
index 44fbc720ec..672d72cc28 100644
--- a/ares/n64/cpu/memory.cpp
+++ b/ares/n64/cpu/memory.cpp
@@ -122,39 +122,8 @@ auto CPU::devirtualize(u64 vaddr, bool raiseAlignedError, bool raiseExceptions)
   unreachable;
 }
 
-// Fast(er) version of devirtualize for icache lookups
-// avoids handling unmapped regions/exceptions as these should have already
-// been handled by instruction fetch, also ignores tlb match failure
-auto CPU::devirtualizeFast(u64 vaddr) -> u64 {
-  // Assume address space is mapped into pages that are 4kb in size
-  // If we have a cached physical address for this page, use it
-  // This cache is purged on any writes to the TLB so should never become stale
-  auto vbase = vaddr >> 12;
-  if(devirtualizeCache.vbase == vbase && devirtualizeCache.pbase) {
-    auto offset = vaddr & 0xfff;
-    return (devirtualizeCache.pbase & ~0xfff) + offset;
-  }
-
-  // Cache the physical address of this page for the next call
-  devirtualizeCache.vbase = vaddr >> 12;
-
-  switch(segment(vaddr)) {
-  case Context::Segment::Mapped: {
-    auto match = tlb.loadFast(vaddr);
-    return devirtualizeCache.pbase = match.paddr & context.physMask;
-  }
-  case Context::Segment::Cached:
-  case Context::Segment::Direct:
-    return devirtualizeCache.pbase = vaddr & 0x1fff'ffff;
-  case Context::Segment::Cached32:
-  case Context::Segment::Direct32:
-    return devirtualizeCache.pbase = vaddr & 0xffff'ffff;
-  }
-  return devirtualizeCache.pbase = 0;
-}
-
 auto CPU::devirtualizeDebug(u64 vaddr) -> u64 {
-  return devirtualizeFast(vaddr); // this wrapper preserves the inlining of 'devirtualizeFast'
+  return devirtualize<false>(vaddr, false).paddr;
 }
 
 template<u32 Size>
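The recompiler hunks that follow lean on the coherent() helper added to cpu.hpp above: a block is only compiled, and only kept, while the i-cache line backing it still matches RAM. Stripped of the ares plumbing, the test is an eight-word compare; a standalone restatement with hypothetical stand-ins (Line, RDRAM, readRam) for the real InstructionCache and busReadBurst<ICache>:

    #include <cstdint>
    #include <cstring>

    uint32_t RDRAM[0x100000];  //toy RAM: 4MiB as 32-bit words

    //stand-in for CPU::busReadBurst<ICache>: one 8-word burst read at paddr
    void readRam(uint32_t paddr, uint32_t out[8]) {
      std::memcpy(out, &RDRAM[(paddr >> 2) & 0xffff8], 8 * sizeof(uint32_t));
    }

    //hypothetical model of one 32-byte instruction cache line
    struct Line {
      bool     valid;
      uint32_t tag;       //paddr & ~0xfff of the page the line was filled from
      uint32_t index;     //byte offset of the line within its 4KB page
      uint32_t words[8];  //the cached instruction words
    };

    //true when it is safe to compile from this address: either the line is
    //not resident (jitFetch will fill it from current RAM before the block
    //first runs), or its cached words still equal RAM
    bool coherent(const Line& line, uint32_t paddr) {
      if(!line.valid || line.tag != (paddr & ~0xfffu)) return true;
      uint32_t ram[8];
      readRam((paddr & ~0xfffu) | line.index, ram);
      return std::memcmp(ram, line.words, sizeof ram) == 0;
    }

    int main() {
      Line line{};  //invalid line: trivially coherent
      return coherent(line, 0x1000) ? 0 : 1;
    }

Reporting a cache miss as coherent is deliberate: a non-resident line has nothing stale to compare against, and the emitted jitFetch call will fill it from current RAM at run time.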
diff --git a/ares/n64/cpu/recompiler.cpp b/ares/n64/cpu/recompiler.cpp
index 68a6962243..05a66941d8 100644
--- a/ares/n64/cpu/recompiler.cpp
+++ b/ares/n64/cpu/recompiler.cpp
@@ -12,8 +12,10 @@ auto CPU::Recompiler::pool(u32 address) -> Pool* {
 auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> Block* {
   if(auto block = pool(address)->blocks[address >> 2 & 0x3f]) return block;
   auto block = emit(vaddr, address, singleInstruction);
-  pool(address)->blocks[address >> 2 & 0x3f] = block;
-  memory::jitprotect(true);
+  if(block) {
+    pool(address)->blocks[address >> 2 & 0x3f] = block;
+    memory::jitprotect(true);
+  }
   return block;
 }
 
@@ -28,11 +30,15 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
     reset();
   }
 
-  auto block = (Block*)allocator.acquire(sizeof(Block));
+  //abort compilation of the block ASAP if the instruction cache is not coherent
+  if(!self.icache.coherent(vaddr, address))
+    return nullptr;
+
   beginFunction(3);
 
   Thread thread;
   bool hasBranched = 0;
+  int numInsn = 0;
   constexpr u32 branchToSelf = 0x1000'ffff;  //beq 0,0,<pc>
   u32 jumpToSelf = 2 << 26 | vaddr >> 2 & 0x3ff'ffff;  //j <pc>
   while(true) {
@@ -46,13 +52,26 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
       mov32(reg(2), imm(instruction));
       call(&CPU::instructionPrologue);
     }
 
+    if(numInsn == 0 || (vaddr & 0x1f) == 0) {
+      //abort compilation of the block if the instruction cache is not coherent
+      if(!self.icache.coherent(vaddr, address)) {
+        resetCompiler();
+        return nullptr;
+      }
+      mov64(reg(1), imm(vaddr));
+      mov32(reg(2), imm(address));
+      call(&CPU::jitFetch);
+    }
+    numInsn++;
     bool branched = emitEXECUTE(instruction);
     if(unlikely(instruction == branchToSelf || instruction == jumpToSelf)) {  //accelerate idle loops
       mov32(reg(1), imm(64 * 2));
       call(&CPU::step);
+    } else {
+      mov32(reg(1), imm(1 * 2));
+      call(&CPU::step);
     }
-    call(&CPU::instructionEpilogue<1>);
     test32(PipelineReg(state), imm(Pipeline::EndBlock), set_z);
     mov32(PipelineReg(state), PipelineReg(nstate));
     mov64(mem(IpuReg(pc)), PipelineReg(pc));
@@ -64,9 +83,11 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
     hasBranched = branched;
     jumpEpilog(flag_nz);
   }
+
   jumpEpilog();
   memory::jitprotect(false);
+  auto block = (Block*)allocator.acquire(sizeof(Block));
   block->code = endFunction();
 
   //print(hex(PC, 8L), " ", instructions, " ", size(), "\n");
 
diff --git a/ares/n64/cpu/tlb.cpp b/ares/n64/cpu/tlb.cpp
index 088ecdb8a4..3bec71ea10 100644
--- a/ares/n64/cpu/tlb.cpp
+++ b/ares/n64/cpu/tlb.cpp
@@ -41,22 +41,6 @@
   return {false};
 }
 
-// Fast(er) version of load for recompiler icache lookups
-// avoids exceptions/debug checks
-auto CPU::TLB::loadFast(u64 vaddr) -> PhysAccess {
-  for(auto& entry : this->entry) {
-    if(!entry.globals && entry.addressSpaceID != self.scc.tlb.addressSpaceID) continue;
-    if((vaddr & entry.addressMaskHi) != entry.virtualAddress) continue;
-    if(vaddr >> 62 != entry.region) continue;
-    bool lo = vaddr & entry.addressSelect;
-    if(!entry.valid[lo]) return {false, 0, 0};
-    physicalAddress = entry.physicalAddress[lo] + (vaddr & entry.addressMaskLo);
-    return {true, entry.cacheAlgorithm[lo] != 2, physicalAddress, vaddr};
-  }
-
-  return {false, 0, 0};
-}
-
 auto CPU::TLB::store(u64 vaddr, const Entry& entry, bool noExceptions) -> maybe<PhysAccess> {
   if(!entry.globals && entry.addressSpaceID != self.scc.tlb.addressSpaceID) return nothing;
   if((vaddr & entry.addressMaskHi) != entry.virtualAddress) return nothing;
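Taken together, the emit() hunks give the compile loop the shape sketched below. This is a stubbed, hypothetical outline, not the real emitter, which drives the code buffer through mov32/mov64/call and also accelerates idle loops by charging 64 * 2 cycles; the point is that the coherence test and the jitFetch timing callback are hoisted to 32-byte line boundaries, and the fixed per-instruction cost that call(&CPU::instructionEpilogue<1>) used to charge is emitted inline as an explicit step() call:

    #include <cstdint>
    #include <cstdio>

    //hypothetical emitter hooks, stubbed so the sketch runs standalone;
    //each would append host code to the block under construction
    void emitCallJitFetch(uint64_t vaddr, uint32_t paddr) {
      std::printf("  call jitFetch(%llx, %x)\n", (unsigned long long)vaddr, paddr);
    }
    void emitCallStep(uint32_t cycles) {
      std::printf("  call step(%u)\n", cycles);
    }
    bool emitInstruction(uint32_t insn) {  //returns "branched"
      std::printf("  emit %08x\n", insn);
      return insn == 0x1000'ffffu;  //pretend only `beq 0,0,<pc>` ends a block
    }
    bool icacheCoherent(uint64_t, uint32_t) { return true; }  //stub
    uint32_t readWord(uint32_t) { return 0x1000'ffffu; }      //stub

    //shape of the patched CPU::Recompiler::emit main loop; returns false
    //when compilation is abandoned and the caller must interpret instead
    bool compile(uint64_t vaddr, uint32_t paddr) {
      for(int numInsn = 0;; numInsn++, vaddr += 4, paddr += 4) {
        //at block entry and on each new 32-byte i-cache line: bail out if
        //the line went stale, else emit the run-time fill/timing callback
        if(numInsn == 0 || (vaddr & 0x1f) == 0) {
          if(!icacheCoherent(vaddr, paddr)) return false;
          emitCallJitFetch(vaddr, paddr);
        }
        bool branched = emitInstruction(readWord(paddr));
        emitCallStep(1 * 2);       //per-instruction cost, replacing the epilogue
        if(branched) return true;  //the real loop also stops at page boundaries
      }
    }

    int main() { compile(0xffff'ffff'8000'0000ull, 0x0000'1000u); }

Moving the Block allocation to the end of compilation fits the same pattern: an abandoned compile returns before anything is acquired from the bump allocator, so no dead Block is left behind.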