From a2e31d9cce69538a6e974e2c0ab0f4bbe3f060a7 Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Mon, 10 Jun 2024 13:30:32 -0300 Subject: [PATCH 1/7] Start implementing interop readback again --- src/ast.rs | 6 ++-- src/hvm.c | 15 ++++++--- src/hvm.rs | 4 +++ src/interop.rs | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/main.rs | 11 ++++-- 6 files changed, 117 insertions(+), 11 deletions(-) create mode 100644 src/interop.rs diff --git a/src/ast.rs b/src/ast.rs index 2d6c0370..584e9eec 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -1,6 +1,6 @@ use TSPL::{new_parser, Parser}; use highlight_error::highlight_error; -use crate::hvm; +use crate::{hvm, interop}; use std::{collections::BTreeMap, fmt::{Debug, Display}}; // Types @@ -363,7 +363,7 @@ impl Book { // -------- impl Tree { - pub fn readback(net: &hvm::GNet, port: hvm::Port, fids: &BTreeMap) -> Option { + pub fn readback(net: &N, port: hvm::Port, fids: &BTreeMap) -> Option { //println!("reading {}", port.show()); match port.get_tag() { hvm::VAR => { @@ -415,7 +415,7 @@ impl Tree { } impl Net { - pub fn readback(net: &hvm::GNet, book: &hvm::Book) -> Option { + pub fn readback(net: &N, book: &hvm::Book) -> Option { let mut fids = BTreeMap::new(); for (fid, def) in book.defs.iter().enumerate() { fids.insert(fid as hvm::Val, def.name.clone()); diff --git a/src/hvm.c b/src/hvm.c index 786de2d2..47274f69 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -1757,8 +1757,7 @@ void do_run_io(Net* net, Book* book, Port port); // Main // ---- - -void hvm_c(u32* book_buffer) { +void hvm_c(u32* book_buffer, Net* net_buffer) { // Creates static TMs alloc_static_tms(); @@ -1773,7 +1772,13 @@ void hvm_c(u32* book_buffer) { u64 start = time64(); // GMem - Net *net = malloc(sizeof(Net)); + Net *net; + if (net_buffer) { + // A buffer for the net has been allocated externally + net = net_buffer; + } else { + net = malloc(sizeof(Net)); + } net_init(net); // Creates an initial redex that calls main @@ -1801,13 +1806,13 @@ void hvm_c(u32* book_buffer) { // Frees everything free_static_tms(); - free(net); + if (!net_buffer) { free(net); } free(book); } #ifdef WITH_MAIN int main() { - hvm_c((u32*)BOOK_BUF); + hvm_c((u32*)BOOK_BUF, NULL); return 0; } #endif diff --git a/src/hvm.rs b/src/hvm.rs index d7007aad..941cf8d7 100644 --- a/src/hvm.rs +++ b/src/hvm.rs @@ -12,15 +12,19 @@ pub type Val = u32; // Val ::= 29-bit (rounded up to u32) pub type Rule = u8; // Rule ::= 8-bit (fits a u8) // Port +#[repr(C)] #[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Hash)] pub struct Port(pub Val); // Pair +#[repr(C)] pub struct Pair(pub u64); // Atomics pub type AVal = AtomicU32; +#[repr(C)] pub struct APort(pub AVal); +#[repr(C)] pub struct APair(pub AtomicU64); // Number diff --git a/src/interop.rs b/src/interop.rs new file mode 100644 index 00000000..d75d3e3b --- /dev/null +++ b/src/interop.rs @@ -0,0 +1,91 @@ +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; + +use crate::hvm::*; + +// Abstract Global Net +// Allows any global net to be read back +// ------------- + +pub trait NetReadback { + fn enter(&self, var: Port) -> Port; + fn node_load(&self, loc: usize) -> Pair; +} + +impl<'a> NetReadback for GNet<'a> { + fn enter(&self, var: Port) -> Port { self.enter(var) } + fn node_load(&self, loc: usize) -> Pair { self.node_load(loc) } +} + +// Global Net equivalent to the C implementation. +// NOTE: If the C struct `Net` changes, this has to change as well. +// TODO: use `bindgen` crate (https://github.com/rust-lang/rust-bindgen) to generate C structs +// ------------- + +#[repr(C)] +pub struct NetC { + pub node: [APair; NetC::G_NODE_LEN], // global node buffer + pub vars: [APort; NetC::G_VARS_LEN], // global vars buffer + pub rbag: [APair; NetC::G_RBAG_LEN], // global rbag buffer + pub itrs: AtomicU64, // interaction count + pub idle: AtomicU32, // idle thread counter +} + +impl NetC { + // Constants relevant in the C implementation + // NOTE: If any of these constants are changed in C, they have to be changed here as well. + pub const TPC_L2: usize = 3; + pub const TPC: usize = 1 << NetC::TPC_L2; + pub const CACHE_PAD: usize = 64; // Cache padding + + pub const HLEN: usize = 1 << 16; // max 16k high-priority redexes + pub const RLEN: usize = 1 << 24; // max 16m low-priority redexes + pub const G_NODE_LEN: usize = 1 << 29; // max 536m nodes + pub const G_VARS_LEN: usize = 1 << 29; // max 536m vars + pub const G_RBAG_LEN: usize = NetC::TPC * NetC::RLEN; + + pub fn vars_exchange(&self, var: usize, val: Port) -> Port { + Port(self.vars[var].0.swap(val.0, Ordering::Relaxed) as u32) + } + + pub fn vars_take(&self, var: usize) -> Port { + self.vars_exchange(var, Port(0)) + } + + fn node_load(&self, loc:usize) -> Pair { + Pair(self.node[loc].0.load(Ordering::Relaxed)) + } + + fn enter(&self, mut var: Port) -> Port { + // While `B` is VAR: extend it (as an optimization) + while var.get_tag() == VAR { + // Takes the current `B` substitution as `B'` + let val = self.vars_exchange(var.get_val() as usize, NONE); + // If there was no `B'`, stop, as there is no extension + if val == NONE || val == Port(0) { + break; + } + // Otherwise, delete `B` (we own both) and continue as `A ~> B'` + self.vars_take(var.get_val() as usize); + var = val; + } + return var; + } +} + +impl NetReadback for NetC { + fn node_load(&self, loc:usize) -> Pair { + self.node_load(loc) + } + + fn enter(&self, var: Port) -> Port { + self.enter(var) + } +} + +// Global Net equivalent to the CUDA implementation. +// NOTE: If the CUDA struct `Net` changes, this has to change as well. +// ------------- + +// TODO +// Problem: CUDA's `GNet` is allocated using `cudaMalloc` +// Solution: Write a CUDA kernel to compact GPU memory and then `memcpy` it to RAM diff --git a/src/lib.rs b/src/lib.rs index 07bd255a..c4f02d71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,3 +5,4 @@ pub mod ast; pub mod cmp; pub mod hvm; +pub mod interop; diff --git a/src/main.rs b/src/main.rs index dee219ab..57391a77 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,15 +3,16 @@ #![allow(unused_variables)] use clap::{Arg, ArgAction, Command}; -use ::hvm::{ast, cmp, hvm}; +use ::hvm::{ast, cmp, hvm, interop}; use std::fs; +use std::alloc; use std::io::Write; use std::path::PathBuf; use std::process::Command as SysCommand; #[cfg(feature = "c")] extern "C" { - fn hvm_c(book_buffer: *const u32); + fn hvm_c(book_buffer: *const u32, net_buffer: *const interop::NetC); } #[cfg(feature = "cuda")] @@ -79,7 +80,11 @@ fn main() { book.to_buffer(&mut data); #[cfg(feature = "c")] unsafe { - hvm_c(data.as_mut_ptr() as *mut u32); + let layout = alloc::Layout::new::(); + let net_ptr = alloc::alloc(layout) as *mut interop::NetC; + hvm_c(data.as_mut_ptr() as *mut u32, net_ptr); + // hvm_c(data.as_mut_ptr() as *mut u32, std::ptr::null()); + alloc::dealloc(net_ptr as *mut u8, layout); } #[cfg(not(feature = "c"))] println!("C runtime not available!\n"); From cbd9cc6767c9e8c44d2271cb8d57cb2c66db1f5a Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Mon, 3 Jun 2024 10:51:49 -0300 Subject: [PATCH 2/7] Abstract implementation running --- src/hvm.c | 22 +++++---- src/interop.rs | 68 +++++++++++++++++++++++--- src/main.rs | 126 ++++++++++++++++++++++++++++++++++++------------- 3 files changed, 168 insertions(+), 48 deletions(-) diff --git a/src/hvm.c b/src/hvm.c index 47274f69..9ed56642 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -1790,19 +1790,21 @@ void hvm_c(u32* book_buffer, Net* net_buffer) { normalize(net, book); #endif - // Prints the result - printf("Result: "); - pretty_print_port(net, book, enter(net, ROOT)); - printf("\n"); - // Stops the timer double duration = (time64() - start) / 1000000000.0; // seconds - // Prints interactions and time - u64 itrs = atomic_load(&net->itrs); - printf("- ITRS: %" PRIu64 "\n", itrs); - printf("- TIME: %.2fs\n", duration); - printf("- MIPS: %.2f\n", (double)itrs / duration / 1000000.0); + if (!net_buffer) { + // Prints the result + printf("Result: "); + pretty_print_port(net, book, enter(net, ROOT)); + printf("\n"); + + // Prints interactions and time + u64 itrs = atomic_load(&net->itrs); + printf("- ITRS: %" PRIu64 "\n", itrs); + printf("- TIME: %.2fs\n", duration); + printf("- MIPS: %.2f\n", (double)itrs / duration / 1000000.0); + } // Frees everything free_static_tms(); diff --git a/src/interop.rs b/src/interop.rs index d75d3e3b..543f3bea 100644 --- a/src/interop.rs +++ b/src/interop.rs @@ -1,19 +1,52 @@ +use std::alloc; use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use crate::hvm::*; +#[cfg(feature = "c")] +extern "C" { + fn hvm_c(book_buffer: *const u32, net_buffer: *const NetC, run_io: bool); +} + +#[cfg(feature = "cuda")] +extern "C" { + fn hvm_cu(book_buffer: *const u32, run_io: bool); +} + // Abstract Global Net // Allows any global net to be read back // ------------- pub trait NetReadback { + fn run(book: &Book, before: impl FnOnce() -> T, after: impl FnOnce(&Self, &Book, T) -> ()); fn enter(&self, var: Port) -> Port; fn node_load(&self, loc: usize) -> Pair; + fn itrs(&self) -> u64; } impl<'a> NetReadback for GNet<'a> { + fn run(book: &Book, before: impl FnOnce() -> T, after: impl FnOnce(&Self, &Book, T) -> ()) { + // Initializes the global net + let net = GNet::new(1 << 29, 1 << 29); + + // Initializes threads + let mut tm = TMem::new(0, 1); + + // Creates an initial redex that calls main + let main_id = book.defs.iter().position(|def| def.name == "main").unwrap(); + tm.rbag.push_redex(Pair::new(Port::new(REF, main_id as u32), ROOT)); + net.vars_create(ROOT.get_val() as usize, NONE); + + let initial_state = before(); + + // Evaluates + tm.evaluator(&net, &book); + + after(&net, book, initial_state); + } fn enter(&self, var: Port) -> Port { self.enter(var) } fn node_load(&self, loc: usize) -> Pair { self.node_load(loc) } + fn itrs(&self) -> u64 { self.itrs.load(Ordering::Relaxed) } } // Global Net equivalent to the C implementation. @@ -21,6 +54,7 @@ impl<'a> NetReadback for GNet<'a> { // TODO: use `bindgen` crate (https://github.com/rust-lang/rust-bindgen) to generate C structs // ------------- +#[cfg(feature = "c")] #[repr(C)] pub struct NetC { pub node: [APair; NetC::G_NODE_LEN], // global node buffer @@ -30,6 +64,7 @@ pub struct NetC { pub idle: AtomicU32, // idle thread counter } +#[cfg(feature = "c")] impl NetC { // Constants relevant in the C implementation // NOTE: If any of these constants are changed in C, they have to be changed here as well. @@ -72,14 +107,35 @@ impl NetC { } } +#[cfg(feature = "c")] impl NetReadback for NetC { - fn node_load(&self, loc:usize) -> Pair { - self.node_load(loc) + fn run(book: &Book, before: impl FnOnce() -> T, after: impl FnOnce(&Self, &Book, T) -> ()) { + // Serialize book + let mut data : Vec = Vec::new(); + book.to_buffer(&mut data); + //println!("{:?}", data); + let book_buffer = data.as_mut_ptr() as *mut u32; + + let layout = alloc::Layout::new::(); + let net_ptr = unsafe { alloc::alloc(layout) as *mut NetC }; + + let initial_state = before(); + + unsafe { + hvm_c(data.as_mut_ptr() as *mut u32, net_ptr, true); + } + + // Converts the raw pointer to a reference + let net_ref = unsafe { &mut *net_ptr }; + + after(net_ref, book, initial_state); + + // Deallocate network's memory + unsafe { alloc::dealloc(net_ptr as *mut u8, layout) }; } - - fn enter(&self, var: Port) -> Port { - self.enter(var) - } + fn node_load(&self, loc:usize) -> Pair { self.node_load(loc) } + fn enter(&self, var: Port) -> Port { self.enter(var) } + fn itrs(&self) -> u64 { self.itrs.load(Ordering::Relaxed) } } // Global Net equivalent to the CUDA implementation. diff --git a/src/main.rs b/src/main.rs index 57391a77..0ad4c1c3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,11 +3,12 @@ #![allow(unused_variables)] use clap::{Arg, ArgAction, Command}; -use ::hvm::{ast, cmp, hvm, interop}; +use ::hvm::{ast, cmp, hvm, interop, interop::NetReadback}; use std::fs; use std::alloc; use std::io::Write; use std::path::PathBuf; +use std::time::Instant; use std::process::Command as SysCommand; #[cfg(feature = "c")] @@ -70,22 +71,16 @@ fn main() { let file = sub_matches.get_one::("file").expect("required"); let code = fs::read_to_string(file).expect("Unable to read file"); let book = ast::Book::parse(&code).unwrap_or_else(|er| panic!("{}",er)).build(); - run(&book); + hvm::GNet::run(&book, before_running, after_running); + // run(&book); } Some(("run-c", sub_matches)) => { let file = sub_matches.get_one::("file").expect("required"); let code = fs::read_to_string(file).expect("Unable to read file"); let book = ast::Book::parse(&code).unwrap_or_else(|er| panic!("{}",er)).build(); - let mut data : Vec = Vec::new(); - book.to_buffer(&mut data); + let run_io = sub_matches.get_flag("io"); #[cfg(feature = "c")] - unsafe { - let layout = alloc::Layout::new::(); - let net_ptr = alloc::alloc(layout) as *mut interop::NetC; - hvm_c(data.as_mut_ptr() as *mut u32, net_ptr); - // hvm_c(data.as_mut_ptr() as *mut u32, std::ptr::null()); - alloc::dealloc(net_ptr as *mut u8, layout); - } + interop::NetC::run(&book, before_running, after_running); #[cfg(not(feature = "c"))] println!("C runtime not available!\n"); } @@ -162,40 +157,107 @@ fn main() { } } -pub fn run(book: &hvm::Book) { - // Initializes the global net - let net = hvm::GNet::new(1 << 29, 1 << 29); - - // Initializes threads - let mut tm = hvm::TMem::new(0, 1); - - // Creates an initial redex that calls main - let main_id = book.defs.iter().position(|def| def.name == "main").unwrap(); - tm.rbag.push_redex(hvm::Pair::new(hvm::Port::new(hvm::REF, main_id as u32), hvm::ROOT)); - net.vars_create(hvm::ROOT.get_val() as usize, hvm::NONE); - - // Starts the timer - let start = std::time::Instant::now(); +pub fn before_running() -> Instant { + Instant::now() +} - // Evaluates - tm.evaluator(&net, &book); - +pub fn after_running(net: &impl interop::NetReadback, book: &hvm::Book, timer: Instant) { // Stops the timer - let duration = start.elapsed(); + let duration = timer.elapsed(); //println!("{}", net.show()); // Prints the result - if let Some(tree) = ast::Net::readback(&net, book) { + if let Some(tree) = ast::Net::readback(net, book) { println!("Result: {}", tree.show()); } else { println!("Readback failed. Printing GNet memdump...\n"); - println!("{}", net.show()); + // println!("{}", net.show()); } // Prints interactions and time - let itrs = net.itrs.load(std::sync::atomic::Ordering::Relaxed); + let itrs = net.itrs(); println!("- ITRS: {}", itrs); println!("- TIME: {:.2}s", duration.as_secs_f64()); println!("- MIPS: {:.2}", itrs as f64 / duration.as_secs_f64() / 1_000_000.0); } + +// pub fn run(book: &hvm::Book) { +// // Initializes the global net +// let net = hvm::GNet::new(1 << 29, 1 << 29); + +// // Initializes threads +// let mut tm = hvm::TMem::new(0, 1); + +// // Creates an initial redex that calls main +// let main_id = book.defs.iter().position(|def| def.name == "main").unwrap(); +// tm.rbag.push_redex(hvm::Pair::new(hvm::Port::new(hvm::REF, main_id as u32), hvm::ROOT)); +// net.vars_create(hvm::ROOT.get_val() as usize, hvm::NONE); + +// // Starts the timer +// let start = std::time::Instant::now(); + +// // Evaluates +// tm.evaluator(&net, &book); + +// // Stops the timer +// let duration = start.elapsed(); + +// //println!("{}", net.show()); + +// // Prints the result +// if let Some(tree) = ast::Net::readback(&net, book) { +// println!("Result: {}", tree.show()); +// } else { +// println!("Readback failed. Printing GNet memdump...\n"); +// println!("{}", net.show()); +// } + +// // Prints interactions and time +// let itrs = net.itrs.load(std::sync::atomic::Ordering::Relaxed); +// println!("- ITRS: {}", itrs); +// println!("- TIME: {:.2}s", duration.as_secs_f64()); +// println!("- MIPS: {:.2}", itrs as f64 / duration.as_secs_f64() / 1_000_000.0); +// } + +// #[cfg(feature = "c")] +// pub fn run_c(book: &hvm::Book) { +// // Serialize book +// let mut data : Vec = Vec::new(); +// book.to_buffer(&mut data); +// //println!("{:?}", data); +// let book_buffer = data.as_mut_ptr() as *mut u32; + +// let layout = alloc::Layout::new::(); +// let net_ptr = unsafe { alloc::alloc(layout) as *mut interop::NetC }; + +// // Starts the timer +// let start = std::time::Instant::now(); + +// unsafe { +// hvm_c(data.as_mut_ptr() as *mut u32, net_ptr, true); +// } + +// // Stops the timer +// let duration = start.elapsed(); + +// // Converts the raw pointer to a reference +// let net_ref = unsafe { &mut *net_ptr }; + +// // Prints the result +// if let Some(tree) = ast::Net::readback(net_ref, book) { +// println!("Result: {}", tree.show()); +// } else { +// println!("Readback failed. Can't print GNet memdump from C.\n"); +// // println!("{}", net_ref.show()); +// } + +// // Prints interactions and time +// let itrs = net_ref.itrs.load(std::sync::atomic::Ordering::Relaxed); +// println!("- ITRS: {}", itrs); +// println!("- TIME: {:.2}s", duration.as_secs_f64()); +// println!("- MIPS: {:.2}", itrs as f64 / duration.as_secs_f64() / 1_000_000.0); + +// // Deallocate network's memory +// unsafe { alloc::dealloc(net_ptr as *mut u8, layout) }; +// } From c0a79aeb1803516cc29a658b82037f9bd0523c0e Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Mon, 3 Jun 2024 13:37:51 -0300 Subject: [PATCH 3/7] Fix IO tests The result of the net reduction was appearing before the IO action writes to stdin. This is because the C implementation wasn't flushing it's `stdin` and `stderr` while the Rust one was. --- src/hvm.c | 4 ++++ src/main.rs | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hvm.c b/src/hvm.c index 9ed56642..28bc0b44 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -1786,6 +1786,10 @@ void hvm_c(u32* book_buffer, Net* net_buffer) { #ifdef IO do_run_io(net, book, ROOT); + // IO actions into `stdout` and `stderr` may appear + // after Rust `print`s if we don't flush + fflush(stdout); + fflush(stderr); #else normalize(net, book); #endif diff --git a/src/main.rs b/src/main.rs index 0ad4c1c3..06734489 100644 --- a/src/main.rs +++ b/src/main.rs @@ -72,7 +72,6 @@ fn main() { let code = fs::read_to_string(file).expect("Unable to read file"); let book = ast::Book::parse(&code).unwrap_or_else(|er| panic!("{}",er)).build(); hvm::GNet::run(&book, before_running, after_running); - // run(&book); } Some(("run-c", sub_matches)) => { let file = sub_matches.get_one::("file").expect("required"); From af49461d50c370a74202fdd531c2df8ea430f897 Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Tue, 11 Jun 2024 11:11:26 -0300 Subject: [PATCH 4/7] Improve net API --- src/interop.rs | 92 +++++++++++++++++++++++++++-------------------- src/main.rs | 97 +++++--------------------------------------------- 2 files changed, 62 insertions(+), 127 deletions(-) diff --git a/src/interop.rs b/src/interop.rs index 543f3bea..e9761e81 100644 --- a/src/interop.rs +++ b/src/interop.rs @@ -5,12 +5,12 @@ use crate::hvm::*; #[cfg(feature = "c")] extern "C" { - fn hvm_c(book_buffer: *const u32, net_buffer: *const NetC, run_io: bool); + fn hvm_c(book_buffer: *const u32, net_buffer: *const RawNetC); } #[cfg(feature = "cuda")] extern "C" { - fn hvm_cu(book_buffer: *const u32, run_io: bool); + fn hvm_cu(book_buffer: *const u32); } // Abstract Global Net @@ -18,14 +18,14 @@ extern "C" { // ------------- pub trait NetReadback { - fn run(book: &Book, before: impl FnOnce() -> T, after: impl FnOnce(&Self, &Book, T) -> ()); + fn run(book: &Book) -> Self; fn enter(&self, var: Port) -> Port; fn node_load(&self, loc: usize) -> Pair; fn itrs(&self) -> u64; } impl<'a> NetReadback for GNet<'a> { - fn run(book: &Book, before: impl FnOnce() -> T, after: impl FnOnce(&Self, &Book, T) -> ()) { + fn run(book: &Book) -> Self { // Initializes the global net let net = GNet::new(1 << 29, 1 << 29); @@ -37,12 +37,10 @@ impl<'a> NetReadback for GNet<'a> { tm.rbag.push_redex(Pair::new(Port::new(REF, main_id as u32), ROOT)); net.vars_create(ROOT.get_val() as usize, NONE); - let initial_state = before(); - // Evaluates tm.evaluator(&net, &book); - after(&net, book, initial_state); + net } fn enter(&self, var: Port) -> Port { self.enter(var) } fn node_load(&self, loc: usize) -> Pair { self.node_load(loc) } @@ -57,6 +55,12 @@ impl<'a> NetReadback for GNet<'a> { #[cfg(feature = "c")] #[repr(C)] pub struct NetC { + raw: *mut RawNetC +} + +#[cfg(feature = "c")] +#[repr(C)] +pub struct RawNetC { pub node: [APair; NetC::G_NODE_LEN], // global node buffer pub vars: [APort; NetC::G_VARS_LEN], // global vars buffer pub rbag: [APair; NetC::G_RBAG_LEN], // global rbag buffer @@ -78,16 +82,53 @@ impl NetC { pub const G_VARS_LEN: usize = 1 << 29; // max 536m vars pub const G_RBAG_LEN: usize = NetC::TPC * NetC::RLEN; - pub fn vars_exchange(&self, var: usize, val: Port) -> Port { - Port(self.vars[var].0.swap(val.0, Ordering::Relaxed) as u32) + pub fn net(&self) -> &RawNetC { + unsafe { &*self.raw } + } + + pub fn net_mut(&mut self) -> &mut RawNetC { + unsafe { &mut *self.raw } + } + + fn vars_exchange(&self, var: usize, val: Port) -> Port { + Port(self.net().vars[var].0.swap(val.0, Ordering::Relaxed) as u32) } pub fn vars_take(&self, var: usize) -> Port { self.vars_exchange(var, Port(0)) } +} + +#[cfg(feature = "c")] +impl Drop for NetC { + fn drop(&mut self) { + // Deallocate network's memory + let layout = alloc::Layout::new::(); + unsafe { alloc::dealloc(self.raw as *mut u8, layout) }; + } +} + +#[cfg(feature = "c")] +impl NetReadback for NetC { + fn run(book: &Book) -> Self { + // Serialize book + let mut data : Vec = Vec::new(); + book.to_buffer(&mut data); + //println!("{:?}", data); + let book_buffer = data.as_mut_ptr() as *mut u32; + + let layout = alloc::Layout::new::(); + let net_ptr = unsafe { alloc::alloc(layout) as *mut RawNetC }; + + unsafe { + hvm_c(data.as_mut_ptr() as *mut u32, net_ptr); + } + + NetC { raw: net_ptr } + } fn node_load(&self, loc:usize) -> Pair { - Pair(self.node[loc].0.load(Ordering::Relaxed)) + Pair(self.net().node[loc].0.load(Ordering::Relaxed)) } fn enter(&self, mut var: Port) -> Port { @@ -105,37 +146,10 @@ impl NetC { } return var; } -} -#[cfg(feature = "c")] -impl NetReadback for NetC { - fn run(book: &Book, before: impl FnOnce() -> T, after: impl FnOnce(&Self, &Book, T) -> ()) { - // Serialize book - let mut data : Vec = Vec::new(); - book.to_buffer(&mut data); - //println!("{:?}", data); - let book_buffer = data.as_mut_ptr() as *mut u32; - - let layout = alloc::Layout::new::(); - let net_ptr = unsafe { alloc::alloc(layout) as *mut NetC }; - - let initial_state = before(); - - unsafe { - hvm_c(data.as_mut_ptr() as *mut u32, net_ptr, true); - } - - // Converts the raw pointer to a reference - let net_ref = unsafe { &mut *net_ptr }; - - after(net_ref, book, initial_state); - - // Deallocate network's memory - unsafe { alloc::dealloc(net_ptr as *mut u8, layout) }; + fn itrs(&self) -> u64 { + self.net().itrs.load(Ordering::Relaxed) } - fn node_load(&self, loc:usize) -> Pair { self.node_load(loc) } - fn enter(&self, var: Port) -> Port { self.enter(var) } - fn itrs(&self) -> u64 { self.itrs.load(Ordering::Relaxed) } } // Global Net equivalent to the CUDA implementation. diff --git a/src/main.rs b/src/main.rs index 06734489..d28cea2a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -71,15 +71,14 @@ fn main() { let file = sub_matches.get_one::("file").expect("required"); let code = fs::read_to_string(file).expect("Unable to read file"); let book = ast::Book::parse(&code).unwrap_or_else(|er| panic!("{}",er)).build(); - hvm::GNet::run(&book, before_running, after_running); + run::(&book); } Some(("run-c", sub_matches)) => { let file = sub_matches.get_one::("file").expect("required"); let code = fs::read_to_string(file).expect("Unable to read file"); let book = ast::Book::parse(&code).unwrap_or_else(|er| panic!("{}",er)).build(); - let run_io = sub_matches.get_flag("io"); #[cfg(feature = "c")] - interop::NetC::run(&book, before_running, after_running); + run::(&book); #[cfg(not(feature = "c"))] println!("C runtime not available!\n"); } @@ -156,18 +155,20 @@ fn main() { } } -pub fn before_running() -> Instant { - Instant::now() -} +pub fn run(book: &hvm::Book) { + // Start timer + let timer = Instant::now(); -pub fn after_running(net: &impl interop::NetReadback, book: &hvm::Book, timer: Instant) { + // Run net + let net = N::run(book); + // Stops the timer let duration = timer.elapsed(); //println!("{}", net.show()); // Prints the result - if let Some(tree) = ast::Net::readback(net, book) { + if let Some(tree) = ast::Net::readback(&net, book) { println!("Result: {}", tree.show()); } else { println!("Readback failed. Printing GNet memdump...\n"); @@ -180,83 +181,3 @@ pub fn after_running(net: &impl interop::NetReadback, book: &hvm::Book, timer: I println!("- TIME: {:.2}s", duration.as_secs_f64()); println!("- MIPS: {:.2}", itrs as f64 / duration.as_secs_f64() / 1_000_000.0); } - -// pub fn run(book: &hvm::Book) { -// // Initializes the global net -// let net = hvm::GNet::new(1 << 29, 1 << 29); - -// // Initializes threads -// let mut tm = hvm::TMem::new(0, 1); - -// // Creates an initial redex that calls main -// let main_id = book.defs.iter().position(|def| def.name == "main").unwrap(); -// tm.rbag.push_redex(hvm::Pair::new(hvm::Port::new(hvm::REF, main_id as u32), hvm::ROOT)); -// net.vars_create(hvm::ROOT.get_val() as usize, hvm::NONE); - -// // Starts the timer -// let start = std::time::Instant::now(); - -// // Evaluates -// tm.evaluator(&net, &book); - -// // Stops the timer -// let duration = start.elapsed(); - -// //println!("{}", net.show()); - -// // Prints the result -// if let Some(tree) = ast::Net::readback(&net, book) { -// println!("Result: {}", tree.show()); -// } else { -// println!("Readback failed. Printing GNet memdump...\n"); -// println!("{}", net.show()); -// } - -// // Prints interactions and time -// let itrs = net.itrs.load(std::sync::atomic::Ordering::Relaxed); -// println!("- ITRS: {}", itrs); -// println!("- TIME: {:.2}s", duration.as_secs_f64()); -// println!("- MIPS: {:.2}", itrs as f64 / duration.as_secs_f64() / 1_000_000.0); -// } - -// #[cfg(feature = "c")] -// pub fn run_c(book: &hvm::Book) { -// // Serialize book -// let mut data : Vec = Vec::new(); -// book.to_buffer(&mut data); -// //println!("{:?}", data); -// let book_buffer = data.as_mut_ptr() as *mut u32; - -// let layout = alloc::Layout::new::(); -// let net_ptr = unsafe { alloc::alloc(layout) as *mut interop::NetC }; - -// // Starts the timer -// let start = std::time::Instant::now(); - -// unsafe { -// hvm_c(data.as_mut_ptr() as *mut u32, net_ptr, true); -// } - -// // Stops the timer -// let duration = start.elapsed(); - -// // Converts the raw pointer to a reference -// let net_ref = unsafe { &mut *net_ptr }; - -// // Prints the result -// if let Some(tree) = ast::Net::readback(net_ref, book) { -// println!("Result: {}", tree.show()); -// } else { -// println!("Readback failed. Can't print GNet memdump from C.\n"); -// // println!("{}", net_ref.show()); -// } - -// // Prints interactions and time -// let itrs = net_ref.itrs.load(std::sync::atomic::Ordering::Relaxed); -// println!("- ITRS: {}", itrs); -// println!("- TIME: {:.2}s", duration.as_secs_f64()); -// println!("- MIPS: {:.2}", itrs as f64 / duration.as_secs_f64() / 1_000_000.0); - -// // Deallocate network's memory -// unsafe { alloc::dealloc(net_ptr as *mut u8, layout) }; -// } From 688d6afcf2b8fabedf7457049a1371e27270c11b Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Wed, 12 Jun 2024 11:35:32 -0300 Subject: [PATCH 5/7] Add initial interop with CUDA (NOTE: not working!) Some tests are returning a segmentation fault. Still have to investigate. --- src/hvm.cu | 35 ++++++++++++++++++++----- src/interop.rs | 71 ++++++++++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 14 ++-------- 3 files changed, 99 insertions(+), 21 deletions(-) diff --git a/src/hvm.cu b/src/hvm.cu index 2b740ef8..a2e5e464 100644 --- a/src/hvm.cu +++ b/src/hvm.cu @@ -2265,6 +2265,16 @@ __global__ void print_result(GNet* gnet) { } } +__global__ void compact(GNet* gnet, Pair* node_out, Port* vars_out) { + // Ideia: percorrer os nós de forma semelhante a `pretty_print_port`, + // colocando eles nos buffers em questão usando uma stack. + + // ??? + // if (threadIdx.x == 0 && blockIdx.x == 0) { + // Port r = vars_take(gnet, ROOT); + // } +} + // Demos // ----- @@ -2286,7 +2296,7 @@ __global__ void print_result(GNet* gnet) { void do_run_io(GNet* gnet, Book* book, Port port); #endif -extern "C" void hvm_cu(u32* book_buffer) { +extern "C" void hvm_cu(u32* book_buffer, GNet* output) { // Start the timer clock_t start = clock(); @@ -2319,7 +2329,10 @@ extern "C" void hvm_cu(u32* book_buffer) { double duration = ((double)(end - start)) / CLOCKS_PER_SEC; // Prints the result - print_result<<<1,1>>>(gnet); + // If `output` is set, the Rust implementation will print the net + if (!output) { + print_result<<<1,1>>>(gnet); + } // Reports errors cudaError_t err = cudaGetLastError(); @@ -2331,6 +2344,11 @@ extern "C" void hvm_cu(u32* book_buffer) { exit(EXIT_FAILURE); } + // If `output` is set, copy the memory from the net into the Rust implementation + if (output) { + cudaMemcpy(output, gnet, sizeof(GNet), cudaMemcpyDeviceToHost); + } + // Prints entire memdump //{ //// Allocate host memory for the net @@ -2355,15 +2373,18 @@ extern "C" void hvm_cu(u32* book_buffer) { //cudaMemcpy(&itrs, &gnet->itrs, sizeof(u64), cudaMemcpyDeviceToHost); // Prints interactions, time and MIPS - printf("- ITRS: %llu\n", gnet_get_itrs(gnet)); - printf("- LEAK: %llu\n", gnet_get_leak(gnet)); - printf("- TIME: %.2fs\n", duration); - printf("- MIPS: %.2f\n", (double)gnet_get_itrs(gnet) / duration / 1000000.0); + // If `output` is set, the Rust implementation will print the net + if (!output) { + printf("- ITRS: %llu\n", gnet_get_itrs(gnet)); + printf("- LEAK: %llu\n", gnet_get_leak(gnet)); + printf("- TIME: %.2fs\n", duration); + printf("- MIPS: %.2f\n", (double)gnet_get_itrs(gnet) / duration / 1000000.0); + } } #ifdef WITH_MAIN int main() { - hvm_cu((u32*)BOOK_BUF); + hvm_cu((u32*)BOOK_BUF, NULL); return 0; } #endif diff --git a/src/interop.rs b/src/interop.rs index e9761e81..2e3cca14 100644 --- a/src/interop.rs +++ b/src/interop.rs @@ -5,12 +5,12 @@ use crate::hvm::*; #[cfg(feature = "c")] extern "C" { - fn hvm_c(book_buffer: *const u32, net_buffer: *const RawNetC); + pub fn hvm_c(book_buffer: *const u32, net_buffer: *const RawNetC); } #[cfg(feature = "cuda")] extern "C" { - fn hvm_cu(book_buffer: *const u32); + pub fn hvm_cu(book_buffer: *const u32, net_buffer: *const RawNetCuda); } // Abstract Global Net @@ -159,3 +159,70 @@ impl NetReadback for NetC { // TODO // Problem: CUDA's `GNet` is allocated using `cudaMalloc` // Solution: Write a CUDA kernel to compact GPU memory and then `memcpy` it to RAM + +// #[cfg(feature = "cuda")] +#[repr(C)] +pub struct NetCuda { + raw: *mut RawNetCuda +} + +// #[cfg(feature = "cuda")] +#[repr(C)] +pub struct RawNetCuda { + pub rbag_use_a: u32, // total rbag redex count (buffer A) + pub rbag_use_b: u32, // total rbag redex count (buffer B) + pub rbag_buf_a: [Pair; NetCuda::G_RBAG_LEN], // global redex bag (buffer A) + pub rbag_buf_b: [Pair; NetCuda::G_RBAG_LEN], // global redex bag (buffer B) + pub node_buf: [Pair; NetCuda::G_NODE_LEN], // global node buffer + pub vars_buf: [Port; NetCuda::G_VARS_LEN], // global vars buffer + pub node_put: [u32; NetCuda::TPB * NetCuda::BPG], + pub vars_put: [u32; NetCuda::TPB * NetCuda::BPG], + pub rbag_put: [u32; NetCuda::TPB * NetCuda::BPG], + pub mode: u8, // evaluation mode (curr) + pub itrs: u64, // interaction count + pub iadd: u64, // interaction count adder + pub leak: u64, // leak count + pub turn: u32, // turn count + pub down: u8, // are we recursing down? + pub rdec: u8, // decrease rpos by 1? +} + +// #[cfg(feature = "cuda")] +impl NetCuda { + // Constants relevant in the CUDA implementation + // NOTE: If any of these constants are changed in CUDA, they have to be changed here as well. + + // Threads per Block + pub const TPB_L2: usize = 7; + pub const TPB: usize = 1 << NetCuda::TPB_L2; + + // Blocks per GPU + pub const BPG_L2: usize = 7; + pub const BPG: usize = 1 << NetCuda::BPG_L2; + + // Thread Redex Bag Length + pub const RLEN: usize = 256; + + pub const G_NODE_LEN: usize = 1 << 29; // max 536m nodes + pub const G_VARS_LEN: usize = 1 << 29; // max 536m vars + pub const G_RBAG_LEN: usize = NetCuda::TPB * NetCuda::BPG * NetCuda::RLEN * 3; + + pub fn net(&self) -> &RawNetCuda { + unsafe { &*self.raw } + } + + pub fn net_mut(&mut self) -> &mut RawNetCuda { + unsafe { &mut *self.raw } + } + + fn vars_exchange(&mut self, var: usize, val: Port) -> Port { + let net = self.net_mut(); + let old = net.vars_buf[var]; + net.vars_buf[var] = val; + old + } + + pub fn vars_take(&mut self, var: usize) -> Port { + self.vars_exchange(var, Port(0)) + } +} diff --git a/src/main.rs b/src/main.rs index d28cea2a..5d61c95d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,16 +11,6 @@ use std::path::PathBuf; use std::time::Instant; use std::process::Command as SysCommand; -#[cfg(feature = "c")] -extern "C" { - fn hvm_c(book_buffer: *const u32, net_buffer: *const interop::NetC); -} - -#[cfg(feature = "cuda")] -extern "C" { - fn hvm_cu(book_buffer: *const u32); -} - fn main() { let matches = Command::new("hvm") .about("HVM2: Higher-order Virtual Machine 2 (32-bit Version)") @@ -90,7 +80,7 @@ fn main() { book.to_buffer(&mut data); #[cfg(feature = "cuda")] unsafe { - hvm_cu(data.as_mut_ptr() as *mut u32); + interop::hvm_cu(data.as_mut_ptr() as *mut u32, std::ptr::null()); } #[cfg(not(feature = "cuda"))] println!("CUDA runtime not available!\n If you've installed CUDA and nvcc after HVM, please reinstall HVM."); @@ -159,7 +149,7 @@ pub fn run(book: &hvm::Book) { // Start timer let timer = Instant::now(); - // Run net + // Normalize net let net = N::run(book); // Stops the timer From 4e14fce14915d32a9975cfb7edf1617242e4e81a Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Wed, 12 Jun 2024 19:20:25 -0300 Subject: [PATCH 6/7] Simplify C interop --- src/ast.rs | 4 +- src/hvm.c | 40 +++++++++++++++----- src/hvm.cu | 10 ++--- src/interop.rs | 101 ++++++++++++++++++++----------------------------- src/main.rs | 4 +- 5 files changed, 81 insertions(+), 78 deletions(-) diff --git a/src/ast.rs b/src/ast.rs index 17cf3324..862b530c 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -364,7 +364,7 @@ impl Book { // -------- impl Tree { - pub fn readback(net: &N, port: hvm::Port, fids: &BTreeMap) -> Option { + pub fn readback(net: &mut N, port: hvm::Port, fids: &BTreeMap) -> Option { //println!("reading {}", port.show()); match port.get_tag() { hvm::VAR => { @@ -416,7 +416,7 @@ impl Tree { } impl Net { - pub fn readback(net: &N, book: &hvm::Book) -> Option { + pub fn readback(net: &mut N, book: &hvm::Book) -> Option { let mut fids = BTreeMap::new(); for (fid, def) in book.defs.iter().enumerate() { fids.insert(fid as hvm::Val, def.name.clone()); diff --git a/src/hvm.c b/src/hvm.c index 28bc0b44..cd8c827b 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -1755,9 +1755,24 @@ void pretty_print_port(Net* net, Book* book, Port port) { void do_run_io(Net* net, Book* book, Port port); #endif +// Output Net +// Used by other languages calling `hvm_c` +// ---- +typedef struct OutputNet { + void *original; + APair *node_buf; + APort *vars_buf; + a64 itrs; +} OutputNet; + +void free_output_net(OutputNet* net) { + free((Net*)net->original); + free(net); +} + // Main // ---- -void hvm_c(u32* book_buffer, Net* net_buffer) { +OutputNet* hvm_c(u32* book_buffer, bool return_output) { // Creates static TMs alloc_static_tms(); @@ -1772,13 +1787,7 @@ void hvm_c(u32* book_buffer, Net* net_buffer) { u64 start = time64(); // GMem - Net *net; - if (net_buffer) { - // A buffer for the net has been allocated externally - net = net_buffer; - } else { - net = malloc(sizeof(Net)); - } + Net *net = malloc(sizeof(Net)); net_init(net); // Creates an initial redex that calls main @@ -1797,7 +1806,7 @@ void hvm_c(u32* book_buffer, Net* net_buffer) { // Stops the timer double duration = (time64() - start) / 1000000000.0; // seconds - if (!net_buffer) { + if (!return_output) { // Prints the result printf("Result: "); pretty_print_port(net, book, enter(net, ROOT)); @@ -1812,8 +1821,19 @@ void hvm_c(u32* book_buffer, Net* net_buffer) { // Frees everything free_static_tms(); - if (!net_buffer) { free(net); } free(book); + + if (return_output) { + OutputNet *output = malloc(sizeof(OutputNet)); + output->original = (void*)net; + output->node_buf = &net->node_buf[0]; + output->vars_buf = &net->vars_buf[0]; + output->itrs = atomic_load(&net->itrs); + return output; + } + + free(net); + return NULL; } #ifdef WITH_MAIN diff --git a/src/hvm.cu b/src/hvm.cu index a2e5e464..7591cafb 100644 --- a/src/hvm.cu +++ b/src/hvm.cu @@ -2296,7 +2296,7 @@ __global__ void compact(GNet* gnet, Pair* node_out, Port* vars_out) { void do_run_io(GNet* gnet, Book* book, Port port); #endif -extern "C" void hvm_cu(u32* book_buffer, GNet* output) { +extern "C" void hvm_cu(u32* book_buffer, bool return_output) { // Start the timer clock_t start = clock(); @@ -2330,7 +2330,7 @@ extern "C" void hvm_cu(u32* book_buffer, GNet* output) { // Prints the result // If `output` is set, the Rust implementation will print the net - if (!output) { + if (!return_output) { print_result<<<1,1>>>(gnet); } @@ -2345,8 +2345,8 @@ extern "C" void hvm_cu(u32* book_buffer, GNet* output) { } // If `output` is set, copy the memory from the net into the Rust implementation - if (output) { - cudaMemcpy(output, gnet, sizeof(GNet), cudaMemcpyDeviceToHost); + if (return_output) { + // cudaMemcpy(output, gnet, sizeof(GNet), cudaMemcpyDeviceToHost); } // Prints entire memdump @@ -2374,7 +2374,7 @@ extern "C" void hvm_cu(u32* book_buffer, GNet* output) { // Prints interactions, time and MIPS // If `output` is set, the Rust implementation will print the net - if (!output) { + if (!return_output) { printf("- ITRS: %llu\n", gnet_get_itrs(gnet)); printf("- LEAK: %llu\n", gnet_get_leak(gnet)); printf("- TIME: %.2fs\n", duration); diff --git a/src/interop.rs b/src/interop.rs index 2e3cca14..a42d5b17 100644 --- a/src/interop.rs +++ b/src/interop.rs @@ -5,12 +5,13 @@ use crate::hvm::*; #[cfg(feature = "c")] extern "C" { - pub fn hvm_c(book_buffer: *const u32, net_buffer: *const RawNetC); + pub fn hvm_c(book_buffer: *const u32, return_output: u8) -> *mut OutputNetC; + pub fn free_output_net(net: *mut OutputNetC); } #[cfg(feature = "cuda")] extern "C" { - pub fn hvm_cu(book_buffer: *const u32, net_buffer: *const RawNetCuda); + pub fn hvm_cu(book_buffer: *const u32, return_output: bool); } // Abstract Global Net @@ -19,9 +20,29 @@ extern "C" { pub trait NetReadback { fn run(book: &Book) -> Self; - fn enter(&self, var: Port) -> Port; fn node_load(&self, loc: usize) -> Pair; + fn vars_exchange(&mut self, var: usize, val: Port) -> Port; fn itrs(&self) -> u64; + + fn vars_take(&mut self, var: usize) -> Port { + self.vars_exchange(var, Port(0)) + } + + fn enter(&mut self, mut var: Port) -> Port { + // While `B` is VAR: extend it (as an optimization) + while var.get_tag() == VAR { + // Takes the current `B` substitution as `B'` + let val = self.vars_exchange(var.get_val() as usize, NONE); + // If there was no `B'`, stop, as there is no extension + if val == NONE || val == Port(0) { + break; + } + // Otherwise, delete `B` (we own both) and continue as `A ~> B'` + self.vars_take(var.get_val() as usize); + var = val; + } + return var; + } } impl<'a> NetReadback for GNet<'a> { @@ -42,8 +63,8 @@ impl<'a> NetReadback for GNet<'a> { net } - fn enter(&self, var: Port) -> Port { self.enter(var) } fn node_load(&self, loc: usize) -> Pair { self.node_load(loc) } + fn vars_exchange(&mut self, var: usize, val: Port) -> Port { GNet::vars_exchange(self, var, val) } fn itrs(&self) -> u64 { self.itrs.load(Ordering::Relaxed) } } @@ -55,56 +76,34 @@ impl<'a> NetReadback for GNet<'a> { #[cfg(feature = "c")] #[repr(C)] pub struct NetC { - raw: *mut RawNetC + raw: *mut OutputNetC } #[cfg(feature = "c")] #[repr(C)] -pub struct RawNetC { - pub node: [APair; NetC::G_NODE_LEN], // global node buffer - pub vars: [APort; NetC::G_VARS_LEN], // global vars buffer - pub rbag: [APair; NetC::G_RBAG_LEN], // global rbag buffer +pub struct OutputNetC { + pub original: *mut std::ffi::c_void, + pub node_buf: *mut APair, // global node buffer + pub vars_buf: *mut APort, // global vars buffer pub itrs: AtomicU64, // interaction count - pub idle: AtomicU32, // idle thread counter } #[cfg(feature = "c")] impl NetC { - // Constants relevant in the C implementation - // NOTE: If any of these constants are changed in C, they have to be changed here as well. - pub const TPC_L2: usize = 3; - pub const TPC: usize = 1 << NetC::TPC_L2; - pub const CACHE_PAD: usize = 64; // Cache padding - - pub const HLEN: usize = 1 << 16; // max 16k high-priority redexes - pub const RLEN: usize = 1 << 24; // max 16m low-priority redexes - pub const G_NODE_LEN: usize = 1 << 29; // max 536m nodes - pub const G_VARS_LEN: usize = 1 << 29; // max 536m vars - pub const G_RBAG_LEN: usize = NetC::TPC * NetC::RLEN; - - pub fn net(&self) -> &RawNetC { + pub fn net<'a>(&'a self) -> &'a OutputNetC { unsafe { &*self.raw } } - pub fn net_mut(&mut self) -> &mut RawNetC { + pub fn net_mut<'a>(&'a mut self) -> &'a mut OutputNetC { unsafe { &mut *self.raw } } - - fn vars_exchange(&self, var: usize, val: Port) -> Port { - Port(self.net().vars[var].0.swap(val.0, Ordering::Relaxed) as u32) - } - - pub fn vars_take(&self, var: usize) -> Port { - self.vars_exchange(var, Port(0)) - } } #[cfg(feature = "c")] impl Drop for NetC { fn drop(&mut self) { // Deallocate network's memory - let layout = alloc::Layout::new::(); - unsafe { alloc::dealloc(self.raw as *mut u8, layout) }; + unsafe { free_output_net(self.raw); } } } @@ -117,34 +116,22 @@ impl NetReadback for NetC { //println!("{:?}", data); let book_buffer = data.as_mut_ptr() as *mut u32; - let layout = alloc::Layout::new::(); - let net_ptr = unsafe { alloc::alloc(layout) as *mut RawNetC }; - - unsafe { - hvm_c(data.as_mut_ptr() as *mut u32, net_ptr); - } + // Run net + let raw = unsafe { hvm_c(data.as_mut_ptr() as *mut u32, 1) }; - NetC { raw: net_ptr } + NetC { raw } } fn node_load(&self, loc:usize) -> Pair { - Pair(self.net().node[loc].0.load(Ordering::Relaxed)) + unsafe { + Pair((*self.net().node_buf.add(loc)).0.load(Ordering::Relaxed)) + } } - fn enter(&self, mut var: Port) -> Port { - // While `B` is VAR: extend it (as an optimization) - while var.get_tag() == VAR { - // Takes the current `B` substitution as `B'` - let val = self.vars_exchange(var.get_val() as usize, NONE); - // If there was no `B'`, stop, as there is no extension - if val == NONE || val == Port(0) { - break; - } - // Otherwise, delete `B` (we own both) and continue as `A ~> B'` - self.vars_take(var.get_val() as usize); - var = val; + fn vars_exchange(&mut self, var: usize, val: Port) -> Port { + unsafe { + Port((*self.net().vars_buf.add(var)).0.swap(val.0, Ordering::Relaxed) as u32) } - return var; } fn itrs(&self) -> u64 { @@ -221,8 +208,4 @@ impl NetCuda { net.vars_buf[var] = val; old } - - pub fn vars_take(&mut self, var: usize) -> Port { - self.vars_exchange(var, Port(0)) - } } diff --git a/src/main.rs b/src/main.rs index 5d61c95d..20fa9690 100644 --- a/src/main.rs +++ b/src/main.rs @@ -150,7 +150,7 @@ pub fn run(book: &hvm::Book) { let timer = Instant::now(); // Normalize net - let net = N::run(book); + let mut net = N::run(book); // Stops the timer let duration = timer.elapsed(); @@ -158,7 +158,7 @@ pub fn run(book: &hvm::Book) { //println!("{}", net.show()); // Prints the result - if let Some(tree) = ast::Net::readback(&net, book) { + if let Some(tree) = ast::Net::readback(&mut net, book) { println!("Result: {}", tree.show()); } else { println!("Readback failed. Printing GNet memdump...\n"); From 20f1389eefcf003ccf264af34df60d76ded8a9c7 Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Wed, 12 Jun 2024 20:57:01 -0300 Subject: [PATCH 7/7] Working CUDA interop Problem: cudaMemcpy from device to host taking too long, we could try compacting the memory --- src/hvm.c | 2 +- src/hvm.cu | 67 +++++++++++++++++++++----------- src/interop.rs | 101 ++++++++++++++++++++++++++----------------------- src/main.rs | 4 +- 4 files changed, 101 insertions(+), 73 deletions(-) diff --git a/src/hvm.c b/src/hvm.c index cd8c827b..f4abd852 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -1765,7 +1765,7 @@ typedef struct OutputNet { a64 itrs; } OutputNet; -void free_output_net(OutputNet* net) { +void free_output_net_c(OutputNet* net) { free((Net*)net->original); free(net); } diff --git a/src/hvm.cu b/src/hvm.cu index 7591cafb..2f87b409 100644 --- a/src/hvm.cu +++ b/src/hvm.cu @@ -2265,16 +2265,6 @@ __global__ void print_result(GNet* gnet) { } } -__global__ void compact(GNet* gnet, Pair* node_out, Port* vars_out) { - // Ideia: percorrer os nós de forma semelhante a `pretty_print_port`, - // colocando eles nos buffers em questão usando uma stack. - - // ??? - // if (threadIdx.x == 0 && blockIdx.x == 0) { - // Port r = vars_take(gnet, ROOT); - // } -} - // Demos // ----- @@ -2289,6 +2279,37 @@ __global__ void compact(GNet* gnet, Pair* node_out, Port* vars_out) { //COMPILED_BOOK_BUF// +// Output Net +// Used by other languages calling `hvm_cu` +// ---- +struct OutputNet { + void *original; + Pair *node_buf; + Port *vars_buf; + u64 itrs; +}; + +OutputNet* create_output_net(GNet* gnet) { + OutputNet* output = (OutputNet*)malloc(sizeof(OutputNet)); + + // Allocate host memory for the net + GNet* h_gnet = (GNet*)malloc(sizeof(GNet)); + + // Copy the net from device to host + cudaMemcpy(h_gnet, gnet, sizeof(GNet), cudaMemcpyDeviceToHost); + + output->original = (void*)h_gnet; + output->node_buf = h_gnet->node_buf; + output->vars_buf = h_gnet->vars_buf; + output->itrs = h_gnet->itrs; + return output; +} + +extern "C" void free_output_net_cuda(OutputNet* net) { + free((GNet*)net->original); + free(net); +} + // Main // ---- @@ -2296,7 +2317,7 @@ __global__ void compact(GNet* gnet, Pair* node_out, Port* vars_out) { void do_run_io(GNet* gnet, Book* book, Port port); #endif -extern "C" void hvm_cu(u32* book_buffer, bool return_output) { +extern "C" OutputNet* hvm_cu(u32* book_buffer, bool return_output) { // Start the timer clock_t start = clock(); @@ -2318,6 +2339,10 @@ extern "C" void hvm_cu(u32* book_buffer, bool return_output) { #ifdef IO do_run_io(gnet, book, ROOT); + // IO actions into `stdout` and `stderr` may appear + // after Rust `print`s if we don't flush + fflush(stdout); + fflush(stderr); #else gnet_normalize(gnet); #endif @@ -2344,11 +2369,6 @@ extern "C" void hvm_cu(u32* book_buffer, bool return_output) { exit(EXIT_FAILURE); } - // If `output` is set, copy the memory from the net into the Rust implementation - if (return_output) { - // cudaMemcpy(output, gnet, sizeof(GNet), cudaMemcpyDeviceToHost); - } - // Prints entire memdump //{ //// Allocate host memory for the net @@ -2374,17 +2394,20 @@ extern "C" void hvm_cu(u32* book_buffer, bool return_output) { // Prints interactions, time and MIPS // If `output` is set, the Rust implementation will print the net - if (!return_output) { - printf("- ITRS: %llu\n", gnet_get_itrs(gnet)); - printf("- LEAK: %llu\n", gnet_get_leak(gnet)); - printf("- TIME: %.2fs\n", duration); - printf("- MIPS: %.2f\n", (double)gnet_get_itrs(gnet) / duration / 1000000.0); + if (return_output) { + return create_output_net(gnet); } + + printf("- ITRS: %llu\n", gnet_get_itrs(gnet)); + printf("- LEAK: %llu\n", gnet_get_leak(gnet)); + printf("- TIME: %.2fs\n", duration); + printf("- MIPS: %.2f\n", (double)gnet_get_itrs(gnet) / duration / 1000000.0); + return NULL; } #ifdef WITH_MAIN int main() { - hvm_cu((u32*)BOOK_BUF, NULL); + hvm_cu((u32*)BOOK_BUF, false); return 0; } #endif diff --git a/src/interop.rs b/src/interop.rs index a42d5b17..f6611b1e 100644 --- a/src/interop.rs +++ b/src/interop.rs @@ -6,12 +6,13 @@ use crate::hvm::*; #[cfg(feature = "c")] extern "C" { pub fn hvm_c(book_buffer: *const u32, return_output: u8) -> *mut OutputNetC; - pub fn free_output_net(net: *mut OutputNetC); + pub fn free_output_net_c(net: *mut OutputNetC); } #[cfg(feature = "cuda")] extern "C" { - pub fn hvm_cu(book_buffer: *const u32, return_output: bool); + pub fn hvm_cu(book_buffer: *const u32, return_output: bool) -> *mut OutputNetCuda; + pub fn free_output_net_cuda(net: *mut OutputNetCuda); } // Abstract Global Net @@ -103,7 +104,7 @@ impl NetC { impl Drop for NetC { fn drop(&mut self) { // Deallocate network's memory - unsafe { free_output_net(self.raw); } + unsafe { free_output_net_c(self.raw); } } } @@ -147,65 +148,71 @@ impl NetReadback for NetC { // Problem: CUDA's `GNet` is allocated using `cudaMalloc` // Solution: Write a CUDA kernel to compact GPU memory and then `memcpy` it to RAM -// #[cfg(feature = "cuda")] +#[cfg(feature = "cuda")] #[repr(C)] pub struct NetCuda { - raw: *mut RawNetCuda + raw: *mut OutputNetCuda } -// #[cfg(feature = "cuda")] +#[cfg(feature = "cuda")] #[repr(C)] -pub struct RawNetCuda { - pub rbag_use_a: u32, // total rbag redex count (buffer A) - pub rbag_use_b: u32, // total rbag redex count (buffer B) - pub rbag_buf_a: [Pair; NetCuda::G_RBAG_LEN], // global redex bag (buffer A) - pub rbag_buf_b: [Pair; NetCuda::G_RBAG_LEN], // global redex bag (buffer B) - pub node_buf: [Pair; NetCuda::G_NODE_LEN], // global node buffer - pub vars_buf: [Port; NetCuda::G_VARS_LEN], // global vars buffer - pub node_put: [u32; NetCuda::TPB * NetCuda::BPG], - pub vars_put: [u32; NetCuda::TPB * NetCuda::BPG], - pub rbag_put: [u32; NetCuda::TPB * NetCuda::BPG], - pub mode: u8, // evaluation mode (curr) +pub struct OutputNetCuda { + pub original: *mut std::ffi::c_void, + pub node_buf: *mut Pair, // global node buffer + pub vars_buf: *mut Port, // global vars buffer pub itrs: u64, // interaction count - pub iadd: u64, // interaction count adder - pub leak: u64, // leak count - pub turn: u32, // turn count - pub down: u8, // are we recursing down? - pub rdec: u8, // decrease rpos by 1? } -// #[cfg(feature = "cuda")] +#[cfg(feature = "cuda")] impl NetCuda { - // Constants relevant in the CUDA implementation - // NOTE: If any of these constants are changed in CUDA, they have to be changed here as well. - - // Threads per Block - pub const TPB_L2: usize = 7; - pub const TPB: usize = 1 << NetCuda::TPB_L2; - - // Blocks per GPU - pub const BPG_L2: usize = 7; - pub const BPG: usize = 1 << NetCuda::BPG_L2; - - // Thread Redex Bag Length - pub const RLEN: usize = 256; - - pub const G_NODE_LEN: usize = 1 << 29; // max 536m nodes - pub const G_VARS_LEN: usize = 1 << 29; // max 536m vars - pub const G_RBAG_LEN: usize = NetCuda::TPB * NetCuda::BPG * NetCuda::RLEN * 3; - - pub fn net(&self) -> &RawNetCuda { + pub fn net(&self) -> &OutputNetCuda { unsafe { &*self.raw } } - pub fn net_mut(&mut self) -> &mut RawNetCuda { + pub fn net_mut(&mut self) -> &mut OutputNetCuda { unsafe { &mut *self.raw } } +} + +#[cfg(feature = "cuda")] +impl Drop for NetCuda { + fn drop(&mut self) { + // Deallocate network's memory + unsafe { free_output_net_cuda(self.raw); } + } +} + +#[cfg(feature = "cuda")] +impl NetReadback for NetCuda { + fn run(book: &Book) -> Self { + // Serialize book + let mut data : Vec = Vec::new(); + book.to_buffer(&mut data); + //println!("{:?}", data); + let book_buffer = data.as_mut_ptr() as *mut u32; + + // Run net + let raw = unsafe { hvm_cu(data.as_mut_ptr() as *mut u32, true) }; + + NetCuda { raw } + } + + fn node_load(&self, loc:usize) -> Pair { + unsafe { + Pair((*self.net().node_buf.add(loc)).0) + } + } fn vars_exchange(&mut self, var: usize, val: Port) -> Port { - let net = self.net_mut(); - let old = net.vars_buf[var]; - net.vars_buf[var] = val; - old + unsafe { + let net = self.net_mut(); + let old = *net.vars_buf.add(var); + *net.vars_buf.add(var) = val; + old + } + } + + fn itrs(&self) -> u64 { + self.net().itrs } } diff --git a/src/main.rs b/src/main.rs index 20fa9690..78549858 100644 --- a/src/main.rs +++ b/src/main.rs @@ -79,9 +79,7 @@ fn main() { let mut data : Vec = Vec::new(); book.to_buffer(&mut data); #[cfg(feature = "cuda")] - unsafe { - interop::hvm_cu(data.as_mut_ptr() as *mut u32, std::ptr::null()); - } + run::(&book); #[cfg(not(feature = "cuda"))] println!("CUDA runtime not available!\n If you've installed CUDA and nvcc after HVM, please reinstall HVM."); }