Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

n64: remove call to epilogue function from the jit #1663

Merged
merged 1 commit into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 13 additions & 14 deletions ares/n64/cpu/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,16 +114,19 @@ auto CPU::instruction() -> void {
if(Accuracy::CPU::Recompiler && recompiler.enabled && access.cache) {
if(vaddrAlignedError<Word>(access.vaddr, false)) return;
auto block = recompiler.block(ipu.pc, access.paddr, GDB::server.hasBreakpoints());
block->execute(*this);
} else {
auto data = fetch(access);
if (!data) return;
pipeline.begin();
instructionPrologue(ipu.pc, *data);
decoderEXECUTE(*data);
instructionEpilogue<0>();
pipeline.end();
if(block) {
block->execute(*this);
return;
}
}

auto data = fetch(access);
if (!data) return;
pipeline.begin();
instructionPrologue(ipu.pc, *data);
decoderEXECUTE(*data);
instructionEpilogue<0>();
pipeline.end();
}

auto CPU::instructionPrologue(u64 address, u32 instruction) -> void {
Expand All @@ -132,11 +135,7 @@ auto CPU::instructionPrologue(u64 address, u32 instruction) -> void {

//post-instruction bookkeeping, specialized on the execution path.
//For interpreted execution, r0 is re-zeroed after every instruction: MIPS r0
//is architecturally hardwired to zero, but the decoder may have written it.
//The recompiled path omits this work — the JIT is expected to maintain the
//r0 == 0 invariant itself (the old assert on this was removed; see review
//discussion about re-adding a check inside the JITed code).
template<bool Recompiled>
auto CPU::instructionEpilogue() -> void {
  if constexpr(!Recompiled) {
    ipu.r[0].u64 = 0;
  }
}
Expand Down
24 changes: 22 additions & 2 deletions ares/n64/cpu/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ struct CPU : Thread {
}
}

//used by JIT-compiled code to prefill an instruction cache line: on a miss the
//line is filled, but unlike fetch() no instruction word is returned — only the
//fill side effects are needed here
auto jitFetch(u64 vaddr, u32 paddr, CPU& cpu) -> void {
  auto& cacheLine = this->line(vaddr);
  if(!cacheLine.hit(paddr)) cacheLine.fill(paddr, cpu);
}

//used by the interpreter to fully emulate the instruction cache
auto fetch(u64 vaddr, u32 paddr, CPU& cpu) -> u32 {
auto& line = this->line(vaddr);
Expand All @@ -135,6 +142,18 @@ struct CPU : Thread {
return line.read(paddr);
}

//checks whether the cached line (if any) for this address still matches the
//backing memory; a non-resident line is trivially coherent
auto coherent(u64 vaddr, u32 paddr) -> bool {
  auto& cacheLine = this->line(vaddr);
  if(!cacheLine.hit(paddr)) return true;
  //re-read the line's backing memory and compare it word-by-word
  //(8 x 32-bit words per line)
  u32 memory[8];
  self.busReadBurst<ICache>((paddr & ~0x0000'0fff) | cacheLine.index, memory);
  for(u32 word = 0; word < 8; word++) {
    if(memory[word] != cacheLine.words[word]) return false;
  }
  return true;
}

auto power(bool reset) -> void {
u32 index = 0;
for(auto& line : lines) {
Expand Down Expand Up @@ -232,7 +251,6 @@ struct CPU : Thread {
auto load(u64 vaddr, bool noExceptions = false) -> PhysAccess;
auto load(u64 vaddr, const Entry& entry, bool noExceptions = false) -> maybe<PhysAccess>;

auto loadFast(u64 vaddr) -> PhysAccess;
auto store(u64 vaddr, bool noExceptions = false) -> PhysAccess;
auto store(u64 vaddr, const Entry& entry, bool noExceptions = false) -> maybe<PhysAccess>;

Expand Down Expand Up @@ -279,10 +297,12 @@ struct CPU : Thread {

auto segment(u64 vaddr) -> Context::Segment;
template<u32 Dir, u32 Size> auto devirtualize(u64 vaddr, bool raiseAlignedError = true, bool raiseExceptions = true) -> PhysAccess;
alwaysinline auto devirtualizeFast(u64 vaddr) -> u64;
auto devirtualizeDebug(u64 vaddr) -> u64;

auto fetch(PhysAccess access) -> maybe<u32>;
//thin recompiler hook: ensures the icache line backing (vaddr, addr) is loaded
auto jitFetch(u64 vaddr, u32 addr) -> void { icache.jitFetch(vaddr, addr, *this); }
template<u32 Size> auto busWrite(u32 address, u64 data) -> void;
template<u32 Size> auto busRead(u32 address) -> u64;
template<u32 Size> auto busWriteBurst(u32 address, u32 *data) -> void;
Expand Down
33 changes: 1 addition & 32 deletions ares/n64/cpu/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,39 +122,8 @@ auto CPU::devirtualize(u64 vaddr, bool raiseAlignedError, bool raiseExceptions)
unreachable;
}

// Fast(er) version of devirtualize for icache lookups.
// Avoids handling unmapped regions and exceptions, as these should already
// have been handled by instruction fetch; it also ignores TLB match failures.
//Translate a virtual address to a physical address for icache lookups,
//using a one-entry per-page translation cache to skip segment/TLB resolution.
//Returns 0 for addresses that resolve to no mapping (unmapped segment or,
//via loadFast, a TLB entry that is not valid).
auto CPU::devirtualizeFast(u64 vaddr) -> u64 {
// Assume address space is mapped into pages that are 4kb in size
// If we have a cached physical address for this page, use it
// This cache is purged on any writes to the TLB so should never become stale
auto vbase = vaddr >> 12;
if(devirtualizeCache.vbase == vbase && devirtualizeCache.pbase) {
//cache hit: reuse the page's physical base, re-applying the page offset
//(a pbase of 0 never hits, due to the && devirtualizeCache.pbase check above)
auto offset = vaddr & 0xfff;
return (devirtualizeCache.pbase & ~0xfff) + offset;
}

// Cache the physical address of this page for the next call
devirtualizeCache.vbase = vaddr >> 12;

switch(segment(vaddr)) {
case Context::Segment::Mapped: {
//TLB-mapped segment: match failure is ignored here (match.paddr is used
//as-is); fetch is expected to have raised any exception already
auto match = tlb.loadFast(vaddr);
return devirtualizeCache.pbase = match.paddr & context.physMask;
}
case Context::Segment::Cached:
case Context::Segment::Direct:
//32-bit direct-mapped segments: physical address is the low 29 bits
return devirtualizeCache.pbase = vaddr & 0x1fff'ffff;
case Context::Segment::Cached32:
case Context::Segment::Direct32:
//64-bit direct-mapped segments: physical address is the low 32 bits
return devirtualizeCache.pbase = vaddr & 0xffff'ffff;
}
//unmapped/kernel-only segments fall through to a null translation
return devirtualizeCache.pbase = 0;
}

//address translation for debugger probes: performs a full devirtualization
//with raiseAlignedError=false so that inspecting memory from the debugger
//does not raise address-error exceptions in the emulated CPU
//(the previous comment about preserving the inlining of 'devirtualizeFast'
//was stale: that helper has been removed)
auto CPU::devirtualizeDebug(u64 vaddr) -> u64 {
  return devirtualize<Read, Byte>(vaddr, false).paddr;
}

template<u32 Size>
Expand Down
30 changes: 26 additions & 4 deletions ares/n64/cpu/recompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ auto CPU::Recompiler::pool(u32 address) -> Pool* {
//returns the compiled block for this address, compiling it on demand.
//emit() may now return nullptr (e.g. when the icache is not coherent with
//memory), so the block is only cached — and the JIT pages re-protected —
//when compilation actually succeeded; a null result is passed to the caller,
//which falls back to the interpreter.
auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> Block* {
  if(auto block = pool(address)->blocks[address >> 2 & 0x3f]) return block;
  auto block = emit(vaddr, address, singleInstruction);
  if(block) {
    pool(address)->blocks[address >> 2 & 0x3f] = block;
    memory::jitprotect(true);
  }
  return block;
}

Expand All @@ -28,11 +30,16 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
reset();
}

auto block = (Block*)allocator.acquire(sizeof(Block));
// abort compilation of block asap if the instruction cache is not coherent
if(!self.icache.coherent(vaddr, address))
return nullptr;

bool abort = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `abort` local variable is declared but never used; it should either be removed or wired into the compilation-abort path.

beginFunction(3);

Thread thread;
bool hasBranched = 0;
int numInsn = 0;
constexpr u32 branchToSelf = 0x1000'ffff; //beq 0,0,<pc>
u32 jumpToSelf = 2 << 26 | vaddr >> 2 & 0x3ff'ffff; //j <pc>
while(true) {
Expand All @@ -46,13 +53,26 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
mov32(reg(2), imm(instruction));
call(&CPU::instructionPrologue);
}
if(numInsn == 0 || (vaddr&0x1f)==0){
//abort compilation of block if the instruction cache is not coherent
if(!self.icache.coherent(vaddr, address)) {
resetCompiler();
return nullptr;
}
mov64(reg(1), imm(vaddr));
mov32(reg(2), imm(address));
call(&CPU::jitFetch);
}
numInsn++;
bool branched = emitEXECUTE(instruction);
if(unlikely(instruction == branchToSelf || instruction == jumpToSelf)) {
//accelerate idle loops
mov32(reg(1), imm(64 * 2));
call(&CPU::step);
} else {
mov32(reg(1), imm(1 * 2));
call(&CPU::step);
}
call(&CPU::instructionEpilogue<1>);
test32(PipelineReg(state), imm(Pipeline::EndBlock), set_z);
mov32(PipelineReg(state), PipelineReg(nstate));
mov64(mem(IpuReg(pc)), PipelineReg(pc));
Expand All @@ -64,9 +84,11 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
hasBranched = branched;
jumpEpilog(flag_nz);
}

jumpEpilog();

memory::jitprotect(false);
auto block = (Block*)allocator.acquire(sizeof(Block));
block->code = endFunction();

//print(hex(PC, 8L), " ", instructions, " ", size(), "\n");
Expand Down
16 changes: 0 additions & 16 deletions ares/n64/cpu/tlb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,6 @@ auto CPU::TLB::load(u64 vaddr, bool noExceptions) -> PhysAccess {
return {false};
}

// Fast(er) version of load for recompiler icache lookups.
// Skips exception raising and debug checks.
//Scan the TLB for a matching entry and return the physical access for vaddr.
//Unlike load(), this raises no exceptions and performs no debug checks; a
//miss or an invalid entry simply yields a PhysAccess with found == false.
auto CPU::TLB::loadFast(u64 vaddr) -> PhysAccess {
for(auto& entry : this->entry) {
//entry must be global or belong to the current address space (ASID)
if(!entry.globals && entry.addressSpaceID != self.scc.tlb.addressSpaceID) continue;
//virtual page number must match under the entry's page mask
if((vaddr & entry.addressMaskHi) != entry.virtualAddress) continue;
//top two address bits select the region and must agree with the entry
if(vaddr >> 62 != entry.region) continue;
//addressSelect picks between the entry's even/odd page halves
bool lo = vaddr & entry.addressSelect;
if(!entry.valid[lo]) return { false, 0, 0 };
physicalAddress = entry.physicalAddress[lo] + (vaddr & entry.addressMaskLo);
//cacheAlgorithm == 2 marks the page uncached; second field is 'cacheable'
return {true, entry.cacheAlgorithm[lo] != 2, physicalAddress, vaddr};
}

//no matching entry: report a failed (unmapped) access
return {false, 0, 0};
}

auto CPU::TLB::store(u64 vaddr, const Entry& entry, bool noExceptions) -> maybe<PhysAccess> {
if(!entry.globals && entry.addressSpaceID != self.scc.tlb.addressSpaceID) return nothing;
if((vaddr & entry.addressMaskHi) != entry.virtualAddress) return nothing;
Expand Down
Loading