From 616b3b6e12e1f005887980979374b98ef2ba3b00 Mon Sep 17 00:00:00 2001 From: tykkiman Date: Sun, 8 Sep 2024 17:48:17 +0300 Subject: [PATCH] n64: access compiled blocks via a hash We'd like the recompiler to take the execution context, such as kernel mode, into account when compiling blocks. That's why it's necessary to identify blocks not just by address but by all the information used at compile time. This is done by computing a 32-bit key and using that as a block's identifier instead of the last six physical address bits, as was done before. Since we now have 32-bit instead of 6-bit keys, the block() function hashes the keys before mapping them to one of the 64 pool rows. The hash function was chosen arbitrarily to be better than a simple multiplicative hash and is likely not the best choice for this exact task. --- ares/n64/cpu/cpu.cpp | 10 ++++++++- ares/n64/cpu/cpu.hpp | 21 +++++++++++++++--- ares/n64/cpu/recompiler.cpp | 43 +++++++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/ares/n64/cpu/cpu.cpp b/ares/n64/cpu/cpu.cpp index 29f03ce916..e7fbf92336 100644 --- a/ares/n64/cpu/cpu.cpp +++ b/ares/n64/cpu/cpu.cpp @@ -113,7 +113,15 @@ auto CPU::instruction() -> void { if(Accuracy::CPU::Recompiler && recompiler.enabled && access.cache) { if(vaddrAlignedError(access.vaddr, false)) return; - auto block = recompiler.block(ipu.pc, access.paddr, GDB::server.hasBreakpoints()); + auto block = recompiler.block(ipu.pc, access.paddr, + { + .singleInstruction = GDB::server.hasBreakpoints(), + .endian = Context::Endian(context.endian), + .mode = Context::Mode(context.mode), + .cop1Enabled = scc.status.enable.coprocessor1 > 0, + .floatingPointMode = scc.status.floatingPointMode > 0, + .is64bit = context.bits == 64, + }); block->execute(*this); } else { auto data = fetch(access); diff --git a/ares/n64/cpu/cpu.hpp b/ares/n64/cpu/cpu.hpp index bddb7222a1..700fbe8882 100644 --- a/ares/n64/cpu/cpu.hpp +++ b/ares/n64/cpu/cpu.hpp @@ -863,7 
+863,20 @@ struct CPU : Thread { }; struct Pool { - Block* blocks[1 << 6]; + struct Row { + Block* block; + u32 tag; + }; + Row rows[1 << 6]; + }; + + struct JITContext { + bool singleInstruction; + Context::Endian endian; + Context::Mode mode; + bool cop1Enabled; + bool floatingPointMode; + bool is64bit; }; auto reset() -> void { @@ -899,9 +912,11 @@ struct CPU : Thread { } auto pool(u32 address) -> Pool*; - auto block(u64 vaddr, u32 address, bool singleInstruction = false) -> Block*; + auto computePoolKey(u32 address, JITContext ctx) -> u32; + auto computePoolRow(u32 key) -> u32; + auto block(u64 vaddr, u32 address, JITContext ctx) -> Block*; - auto emit(u64 vaddr, u32 address, bool singleInstruction = false) -> Block*; + auto emit(u64 vaddr, u32 address, JITContext ctx) -> Block*; auto emitZeroClear(u32 n) -> void; auto emitEXECUTE(u32 instruction) -> bool; auto emitSPECIAL(u32 instruction) -> bool; diff --git a/ares/n64/cpu/recompiler.cpp b/ares/n64/cpu/recompiler.cpp index 68a6962243..a7a3bde363 100644 --- a/ares/n64/cpu/recompiler.cpp +++ b/ares/n64/cpu/recompiler.cpp @@ -9,10 +9,41 @@ auto CPU::Recompiler::pool(u32 address) -> Pool* { return pool; } -auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> Block* { - if(auto block = pool(address)->blocks[address >> 2 & 0x3f]) return block; - auto block = emit(vaddr, address, singleInstruction); - pool(address)->blocks[address >> 2 & 0x3f] = block; +auto CPU::Recompiler::computePoolKey(u32 address, JITContext ctx) -> u32 { + u32 key = address >> 2 & 0x3f; + key |= ctx.singleInstruction ? 1 << 6 : 0; + key |= ctx.endian ? 1 << 7 : 0; + key |= (ctx.mode & 0x03) << 8; + key |= ctx.cop1Enabled ? 1 << 10 : 0; + key |= ctx.floatingPointMode ? 1 << 11 : 0; + key |= ctx.is64bit ? 
1 << 12 : 0; + return key; +} + +auto CPU::Recompiler::computePoolRow(u32 key) -> u32 { + // Jon Maiga's 'xmx' mixer, see https://jonkagstrom.com/bit-mixer-construction/ + u64 x = key; + x ^= x >> 23; + x *= 0xff51afd7ed558ccdull; + x ^= x >> 23; + u32 row = x & 0x3f; + assert(row < sizeof(Pool::rows)/sizeof(Pool::rows[0])); + return row; +} + +auto CPU::Recompiler::block(u64 vaddr, u32 address, JITContext ctx) -> Block* { + u32 key = computePoolKey(address, ctx); + u32 row = computePoolRow(key); + + if (pool(address)->rows[row].tag == key) { + if (auto block = pool(address)->rows[row].block) { + return block; + } + } + + memory::jitprotect(false); + auto block = emit(vaddr, address, ctx); + pool(address)->rows[row] = {.block = block, .tag = key}; memory::jitprotect(true); return block; } @@ -21,7 +52,7 @@ auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> B #define IpuReg(r) sreg(1), offsetof(IPU, r) - IpuBase #define PipelineReg(x) mem(sreg(0), offsetof(CPU, pipeline) + offsetof(Pipeline, x)) -auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Block* { +auto CPU::Recompiler::emit(u64 vaddr, u32 address, JITContext ctx) -> Block* { if(unlikely(allocator.available() < 1_MiB)) { print("CPU allocator flush\n"); allocator.release(); @@ -60,7 +91,7 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl vaddr += 4; address += 4; jumpToSelf += 4; - if(hasBranched || (address & 0xfc) == 0 || singleInstruction) break; //block boundary + if(hasBranched || (address & 0xfc) == 0 || ctx.singleInstruction) break; //block boundary hasBranched = branched; jumpEpilog(flag_nz); }