From 869996190dc24b6bca87c2f72c6451e5dcdb0b53 Mon Sep 17 00:00:00 2001
From: msdx321 <msdx321@gmail.com>
Date: Mon, 9 Nov 2020 14:36:15 +0800
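Subject: [PATCH] patina_bench: Misc tweaks

* bench_sched_yield, bench_sl_yield: time the yield in both threads and
  share one iteration counter, so each sample is a one-way context switch.
* patina chan/mutex/timer benches: add a COLD_CACHE mode that touches a
  large buffer before each sample, skips the first sample, and runs fewer
  iterations.
* All benches: drop the ad-hoc "first" flag; with PRINT_ALL, dump samples
  via perfdata_raw() before perfdata_calc(), then always perfdata_print().
* Channel benches: remove the READER_HIGH option; the inter-component
  sender/receiver no longer define USE_EVTMGR.
* perfdata.h: prefix the summary and delimiter lines with '#'.
* unit_pingpong: add ubench to LIBRARY_DEPENDENCIES.
* Add tests/test.c.

Signed-off-by: msdx321 <msdx321@gmail.com>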
---
 .../bench_sched_yield/bench_sched_yield.c     | 41 ++++++---
 .../tests/bench_sl_yield/bench_sl_yield.c     | 37 +++++---
 .../patina_chan_bench/patina_chan_bench.c     | 92 +++++++++++--------
 .../patina_chan_bench_inter_recv.c            |  8 +-
 .../patina_chan_bench_inter_send.c            | 40 +++-----
 .../patina_event_bench/patina_event_bench.c   | 17 ++--
 .../patina_mutex_bench/patina_mutex_bench.c   | 69 +++++++++-----
 .../tests/patina_sem_bench/patina_sem_bench.c | 30 ++----
 .../patina_timer_bench/patina_timer_bench.c   | 50 ++++++++--
 src/components/implementation/tests/test.c    |  8 ++
 .../tests/unit_pingpong/Makefile              |  2 +-
 src/components/lib/ubench/perfdata.h          |  6 +-
 12 files changed, 233 insertions(+), 167 deletions(-)
 create mode 100644 src/components/implementation/tests/test.c

diff --git a/src/components/implementation/tests/bench_sched_yield/bench_sched_yield.c b/src/components/implementation/tests/bench_sched_yield/bench_sched_yield.c
index aba0f30143..8d53abe7ae 100644
--- a/src/components/implementation/tests/bench_sched_yield/bench_sched_yield.c
+++ b/src/components/implementation/tests/bench_sched_yield/bench_sched_yield.c
@@ -17,36 +17,47 @@
 
 /* lo and hi is actually running at the same prio */
 #define ITERATION 10000
-/* #define PRINT_ALL */
+#define PRINT_ALL
 
 thdid_t yield_hi = 0, yield_lo = 0;
+
 volatile cycles_t start;
-volatile cycles_t end;
+volatile int count;
 
 struct perfdata perf;
 cycles_t result[ITERATION] = {0, };
 
 /***
- * We're measuring 2-way context switch time. 
+ * We're measuring one-way context switch time. 
  */
 void
 yield_hi_thd(void *d)
 {
-	/* Never stops running; low priority controls how many iters to run. */
-	while (1) {
+	cycles_t end;
+
+	while (count < ITERATION) {
 		debug("h1,");
+
+		start = time_now();
 		sched_thd_yield_to(yield_lo);
+		end = time_now();
+
 		debug("h2,");
+
+		perfdata_add(&perf, end - start);
+
+		count++;
 	}
+
+	while (1) ;
 }
 
 void
 yield_lo_thd(void *d)
 {
-	int i;
-	int first = 0;
+	cycles_t end;
 
-	for (i = 0; i < ITERATION + 1; i++) {
+	while (count < ITERATION) {
 		debug("l1,");
 
 		start = time_now();
@@ -55,16 +66,16 @@ yield_lo_thd(void *d)
 
 		debug("l2,");
 		
-		if (first == 0) first = 1;
-		else perfdata_add(&perf, end - start);
+		perfdata_add(&perf, end - start);
+
+		count++;
 	}
 	
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	while (1) ;
 }
@@ -77,6 +88,8 @@ test_yield(void)
 		SCHED_PARAM_CONS(SCHEDP_PRIO, 6)
 	};
 
+	count = 0;
+
 	perfdata_init(&perf, "Context switch time", result, ITERATION);
 	
 	printc("Create threads:\n");
diff --git a/src/components/implementation/tests/bench_sl_yield/bench_sl_yield.c b/src/components/implementation/tests/bench_sl_yield/bench_sl_yield.c
index 75a27832bc..aa1091a1ea 100644
--- a/src/components/implementation/tests/bench_sl_yield/bench_sl_yield.c
+++ b/src/components/implementation/tests/bench_sl_yield/bench_sl_yield.c
@@ -19,7 +19,7 @@
 
 /* lo and hi is actually running at the same prio */
 #define ITERATION 10000
-/* #define PRINT_ALL */
+#define PRINT_ALL
 
 /* Ensure this is the same as what is in sl_mod_fprr.c */
 #define SL_FPRR_NPRIOS 32
@@ -32,7 +32,7 @@ struct sl_thd *testing_thread;
 thdid_t thdid1, thdid2;
 
 volatile cycles_t start;
-volatile cycles_t end;
+volatile int count;
 
 struct perfdata perf;
 cycles_t result[ITERATION] = {0, };
@@ -40,21 +40,32 @@ cycles_t result[ITERATION] = {0, };
 static void
 thd1_fn()
 {
-	/* Never stops running; low priority controls how many iters to run. */
-	while (1) {
+	cycles_t end;
+
+	while (count < ITERATION) {
 		debug("h1,");
+
+		start = time_now();
 		sl_thd_yield(thdid2);
+		end = time_now();
+
 		debug("h2,");
+
+		perfdata_add(&perf, end - start);
+
+		count++;
 	}
+
+	while (1); 
 }
 
 static void
 thd2_fn()
 {
 	int i;
-	int first = 0;
+	cycles_t end;
 
-	for (i = 0; i < ITERATION + 1; i++) {
+	while (count < ITERATION) {
 		debug("l1,");
 
 		start = time_now();
@@ -63,16 +74,16 @@ thd2_fn()
 
 		debug("l2,");
 		
-		if (first == 0) first = 1;
-		else perfdata_add(&perf, end - start);
+		perfdata_add(&perf, end - start);
+
+		count++;
 	}
 	
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	while (1) ;
 }
@@ -104,6 +115,8 @@ cos_init(void)
 	struct cos_defcompinfo *defci = cos_defcompinfo_curr_get();
 	struct cos_compinfo    *ci    = cos_compinfo_get(defci);
 
+	count = 0;
+
 	PRINTC("Thread switch benchmark for the scheduling library (sl)\n");
 	cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT);
 	cos_defcompinfo_init();
diff --git a/src/components/implementation/tests/patina_chan_bench/patina_chan_bench.c b/src/components/implementation/tests/patina_chan_bench/patina_chan_bench.c
index 8856ce9467..527902d983 100644
--- a/src/components/implementation/tests/patina_chan_bench/patina_chan_bench.c
+++ b/src/components/implementation/tests/patina_chan_bench/patina_chan_bench.c
@@ -10,6 +10,20 @@
 #include <patina.h>
 #include <perfdata.h>
 
+#define COLD_CACHE
+#ifdef COLD_CACHE
+#define cache_flush() __cache_flush()
+#define COLD_OFFSET 1 /* extra loop iteration that makes up for the skipped sample */
+#define COLD_INDEX 0  /* loop index whose sample is not recorded */
+#else
+#define cache_flush()
+#define COLD_OFFSET 0
+#define COLD_INDEX (-1) /* matches no loop index, so every sample is recorded */
+#endif
+/* Sizes in bytes used by __cache_flush(). NOTE(review): confirm they match the target's cache. */
+#define CACHE_SIZE (512 * 1024)
+#define CACHE_LINE_SIZE 32
+
 #undef PATINA_CHAN_TRACE_DEBUG
 #ifdef PATINA_CHAN_TRACE_DEBUG
 #define debug(format, ...) printc(format, ##__VA_ARGS__)
@@ -18,11 +32,14 @@
 #endif
 
 /* One low-priority thread and one high-priority thread contends on the lock */
+#ifdef COLD_CACHE
+#define ITERATION (10 * 10) /* parenthesized so the product survives any surrounding expression */
+#else
 #define ITERATION 10 * 1000
-#define PRINT_ALL
+#endif
+#undef PRINT_ALL
 
 /* Two options are available: Sender at low/high prio, data words 4 */
-#undef READER_HIGH
 #define DATA_WORDS 2
 
 thdid_t chan_reader = 0, chan_writer = 0;
@@ -60,6 +77,20 @@ patina_chan_r_t rid2;
 patina_chan_s_t sid;
 patina_chan_s_t sid2;
 
+volatile char pool[CACHE_SIZE * 4] = {0};
+
+/* Read-modify-write one byte in each CACHE_LINE_SIZE stride of pool; agg feeds every value
+ * into the next access. NOTE(review): presumably evicts cached state - confirm sizes on target. */
+void
+__cache_flush(void)
+{
+	int agg = 1;
+	for (int i = 0; i < CACHE_SIZE * 4; i += CACHE_LINE_SIZE) {
+		pool[i] += agg;
+		agg = pool[i];
+	}
+}
+
 /***
  * The two threads reciprocally sends and receives.
  */
@@ -84,10 +115,10 @@ void
 chan_writer_thd(void *d)
 {
 	int i;
-	int first = 0;
 
-	for (int i = 0; i < ITERATION + 1; i++) {
+	for (int i = 0; i < ITERATION + COLD_OFFSET; i++) {
 		debug("w1,");
+		cache_flush();
 		ts1[0] = time_now();
 		debug("ts1: %d,", ts1[0]);
 		debug("w2,");
@@ -99,35 +130,25 @@ chan_writer_thd(void *d)
 		ts3[0] = time_now();
 		debug("w5,");
 
-		if (first == 0)
-			first = 1;
-		else {
-			if (ts2[0] > ts1[0] && ts3[0] > ts2[0]) {
-				perfdata_add(&perf1, ts2[0] - ts1[0]);
-				perfdata_add(&perf2, ts3[0] - ts2[0]);
-				perfdata_add(&perf3, ts3[0] - ts1[0]);
-			}
+		if (ts2[0] > ts1[0] && ts3[0] > ts2[0] && i != COLD_INDEX) {
+			perfdata_add(&perf1, ts2[0] - ts1[0]);
+			perfdata_add(&perf2, ts3[0] - ts2[0]);
+			perfdata_add(&perf3, ts3[0] - ts1[0]);
 		}
 	}
 
+#ifdef PRINT_ALL
+	perfdata_raw(&perf1);
+	perfdata_raw(&perf2);
+	perfdata_raw(&perf3);
+#endif
 	perfdata_calc(&perf1);
 	perfdata_calc(&perf2);
 	perfdata_calc(&perf3);
-#ifdef PRINT_ALL
-#ifdef READER_HIGH
-	perfdata_all(&perf1);
-#else
-	perfdata_all(&perf2);
-#endif
-	perfdata_all(&perf3);
-#else
-#ifdef READER_HIGH
+
 	perfdata_print(&perf1);
-#else
 	perfdata_print(&perf2);
-#endif
 	perfdata_print(&perf3);
-#endif
 
 	while (1)
 		;
@@ -137,18 +158,13 @@ void
 test_chan(void)
 {
 	int      i;
-	int      first = 0;
 	cycles_t begin, end;
 
-#ifdef READER_HIGH
 	sched_param_t sps[] = {SCHED_PARAM_CONS(SCHEDP_PRIO, 4), SCHED_PARAM_CONS(SCHEDP_PRIO, 6)};
-#else
-	sched_param_t sps[] = {SCHED_PARAM_CONS(SCHEDP_PRIO, 6), SCHED_PARAM_CONS(SCHEDP_PRIO, 4)};
-#endif
 
 	/* Uncontended lock taking/releasing */
 	perfdata_init(&perf1, "Uncontended channel - selfloop", result1, ITERATION);
-	for (i = 0; i < ITERATION + 1; i++) {
+	for (i = 0; i < ITERATION; i++) {
 		begin = time_now();
 
 		debug("send\n");
@@ -157,17 +173,13 @@ test_chan(void)
 		patina_channel_recv(rid, tmp, 1, 0);
 
 		end = time_now();
-		if (first == 0)
-			first = 1;
-		else
-			perfdata_add(&perf1, end - begin);
+		perfdata_add(&perf1, end - begin);
 	}
-	perfdata_calc(&perf1);
 #ifdef PRINT_ALL
-	perfdata_all(&perf1);
-#else
-	perfdata_print(&perf1);
+	perfdata_raw(&perf1);
 #endif
+	perfdata_calc(&perf1);
+	perfdata_print(&perf1);
 
 	perfdata_init(&perf1, "Contended channel - reader high use this", result1, ITERATION);
 	perfdata_init(&perf2, "Contended channel - writer high use this", result2, ITERATION);
@@ -176,11 +188,11 @@ test_chan(void)
 	printc("Create threads:\n");
 
 	chan_reader = sched_thd_create(chan_reader_thd, NULL);
-	printc("\tcreating reader thread %d at prio %d\n", chan_reader, sps[0]);
+	printc("\tcreating reader thread %d at prio %d\n", chan_reader, sps[1]); /* NOTE(review): logs sps[1], but sps[0] is what gets applied on the next line - confirm */
 	sched_thd_param_set(chan_reader, sps[0]);
 
 	chan_writer = sched_thd_create(chan_writer_thd, NULL);
-	printc("\tcreating writer thread %d at prio %d\n", chan_writer, sps[1]);
+	printc("\tcreating writer thread %d at prio %d\n", chan_writer, sps[0]); /* NOTE(review): logs sps[0], but sps[1] is what gets applied on the next line - confirm */
 	sched_thd_param_set(chan_writer, sps[1]);
 }
 
diff --git a/src/components/implementation/tests/patina_chan_bench_inter_recv/patina_chan_bench_inter_recv.c b/src/components/implementation/tests/patina_chan_bench_inter_recv/patina_chan_bench_inter_recv.c
index 267e074879..2f291e0269 100644
--- a/src/components/implementation/tests/patina_chan_bench_inter_recv/patina_chan_bench_inter_recv.c
+++ b/src/components/implementation/tests/patina_chan_bench_inter_recv/patina_chan_bench_inter_recv.c
@@ -22,19 +22,13 @@ patina_chan_r_t rid;
 patina_event_t  evt;
 
 /* Keep these settings below consistent with the sender side */
-#undef READER_HIGH
-#define USE_EVTMGR
+#undef USE_EVTMGR
 
 #define TEST_CHAN_ITEM_SZ sizeof(u32_t)
 #define TEST_CHAN_NSLOTS 2
 #define TEST_CHAN_SEND_ID 3
 #define TEST_CHAN_RECV_ID 4
-/* We are the receiver, and we don't care about data gathering */
-#ifdef READER_HIGH
 #define TEST_CHAN_PRIO_SELF 4
-#else
-#define TEST_CHAN_PRIO_SELF 5
-#endif
 
 typedef unsigned int cycles_32_t;
 
diff --git a/src/components/implementation/tests/patina_chan_bench_inter_send/patina_chan_bench_inter_send.c b/src/components/implementation/tests/patina_chan_bench_inter_send/patina_chan_bench_inter_send.c
index 40dc28eeb2..aff9d84926 100644
--- a/src/components/implementation/tests/patina_chan_bench_inter_send/patina_chan_bench_inter_send.c
+++ b/src/components/implementation/tests/patina_chan_bench_inter_send/patina_chan_bench_inter_send.c
@@ -22,8 +22,7 @@ patina_chan_r_t rid;
 patina_event_t  evt;
 
 #define ITERATION 10 * 1000
-#undef READER_HIGH
-#define USE_EVTMGR
+#undef USE_EVTMGR
 #define PRINT_ALL
 
 #define TEST_CHAN_ITEM_SZ sizeof(u32_t)
@@ -31,11 +30,7 @@ patina_event_t  evt;
 #define TEST_CHAN_SEND_ID 4
 #define TEST_CHAN_RECV_ID 3
 /* We are the sender, and we will be responsible for collecting resulting data */
-#ifdef READER_HIGH
 #define TEST_CHAN_PRIO_SELF 5
-#else
-#define TEST_CHAN_PRIO_SELF 4
-#endif
 
 typedef unsigned int cycles_32_t;
 
@@ -56,7 +51,6 @@ main(void)
 	int         i;
 	cycles_t    wakeup;
 	cycles_32_t ts1, ts2, ts3;
-	int         first = 0;
 #ifdef USE_EVTMGR
 	evt_res_id_t   evt_id;
 	evt_res_data_t evtdata;
@@ -83,7 +77,7 @@ main(void)
 	wakeup = time_now() + time_usec2cyc(1000 * 1000);
 	sched_thd_block_timeout(0, wakeup);
 
-	for (int i = 0; i < ITERATION + 1; i++) {
+	for (int i = 0; i < ITERATION; i++) {
 		debug("w1,");
 		ts1 = time_now();
 		debug("ts1: %d,", ts1);
@@ -102,35 +96,25 @@ main(void)
 		ts3 = time_now();
 		debug("w5,");
 
-		if (first == 0)
-			first = 1;
-		else {
-			if (ts2 > ts1 && ts3 > ts2) {
-				perfdata_add(&perf1, ts2 - ts1);
-				perfdata_add(&perf2, ts3 - ts2);
-				perfdata_add(&perf3, ts3 - ts1);
-			}
+		if (ts2 > ts1 && ts3 > ts2) {
+			perfdata_add(&perf1, ts2 - ts1);
+			perfdata_add(&perf2, ts3 - ts2);
+			perfdata_add(&perf3, ts3 - ts1);
 		}
 	}
 
+#ifdef PRINT_ALL
+	perfdata_raw(&perf1);
+	perfdata_raw(&perf2);
+	perfdata_raw(&perf3);
+#endif
 	perfdata_calc(&perf1);
 	perfdata_calc(&perf2);
 	perfdata_calc(&perf3);
-#ifdef PRINT_ALL
-#ifdef READER_HIGH
-	perfdata_all(&perf1);
-#else
-	perfdata_all(&perf2);
-#endif
-	perfdata_all(&perf3);
-#else
-#ifdef READER_HIGH
+
 	perfdata_print(&perf1);
-#else
 	perfdata_print(&perf2);
-#endif
 	perfdata_print(&perf3);
-#endif
 
 	while (1)
 		;
diff --git a/src/components/implementation/tests/patina_event_bench/patina_event_bench.c b/src/components/implementation/tests/patina_event_bench/patina_event_bench.c
index e0baac548c..391a7c9f4f 100644
--- a/src/components/implementation/tests/patina_event_bench/patina_event_bench.c
+++ b/src/components/implementation/tests/patina_event_bench/patina_event_bench.c
@@ -39,29 +39,24 @@ void
 evt_hi_thd(void *d)
 {
 	int i;
-	int first = 0;
 
-	for (i = 0; i < ITERATION + 1; i++) {
+	for (i = 0; i < ITERATION; i++) {
 		debug("h1");
 		patina_event_wait(&evt, NULL, 0);
 		end = time_now();
 
 		debug("h2");
-		if (first == 0)
-			first = 1;
-		else
-			perfdata_add(&perf, end - start);
+		perfdata_add(&perf, end - start);
 
 		debug("h3");
 		patina_sem_give(sid);
 	}
 
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	while (1)
 		;
@@ -98,7 +93,7 @@ test_evt(void)
 	printc("Create threads:\n");
 
 	evt_lo = sched_thd_create(evt_lo_thd, NULL);
-	printc("\tcreating lo thread %d at prio %d\n", evt_lo, sps[1]);
+	printc("\tcreating lo thread %d at prio %d\n", evt_lo, sps[0]); /* NOTE(review): logs sps[0], but sps[1] is what gets applied on the next line - confirm */
 	sched_thd_param_set(evt_lo, sps[1]);
 
 	evt_hi = sched_thd_create(evt_hi_thd, NULL);
diff --git a/src/components/implementation/tests/patina_mutex_bench/patina_mutex_bench.c b/src/components/implementation/tests/patina_mutex_bench/patina_mutex_bench.c
index 7a1b5cebbf..7c362b1337 100644
--- a/src/components/implementation/tests/patina_mutex_bench/patina_mutex_bench.c
+++ b/src/components/implementation/tests/patina_mutex_bench/patina_mutex_bench.c
@@ -10,6 +10,20 @@
 #include <patina.h>
 #include <perfdata.h>
 
+#define COLD_CACHE
+#ifdef COLD_CACHE
+#define cache_flush() __cache_flush()
+#define COLD_OFFSET 1 /* extra loop iteration that makes up for the skipped sample */
+#define COLD_INDEX 0  /* loop index whose sample is not recorded */
+#else
+#define cache_flush()
+#define COLD_OFFSET 0
+#define COLD_INDEX (-1) /* matches no loop index, so every sample is recorded */
+#endif
+/* Sizes in bytes used by __cache_flush(). NOTE(review): confirm they match the target's cache. */
+#define CACHE_SIZE (512 * 1024)
+#define CACHE_LINE_SIZE 32
+
 #undef LOCK_TRACE_DEBUG
 #ifdef LOCK_TRACE_DEBUG
 #define debug(format, ...) printc(format, ##__VA_ARGS__)
@@ -18,7 +32,14 @@
 #endif
 
 /* One low-priority thread and one high-priority thread contends on the lock */
+#ifdef COLD_CACHE
+#define ITERATION (10 * 10)
+#define SLEEP_TIME (100 * 1000) /* microseconds; passed to time_usec2cyc() */
+#else
 #define ITERATION 10 * 1000
+#define SLEEP_TIME 1000
+#endif
+
 #define PRINT_ALL
 
 patina_mutex_t mid;
@@ -33,6 +54,20 @@ cycles_t        result[ITERATION] = {
   0,
 };
 
+volatile char pool[CACHE_SIZE * 4] = {0};
+
+/* Read-modify-write one byte in each CACHE_LINE_SIZE stride of pool; agg feeds every value
+ * into the next access. NOTE(review): presumably evicts cached state - confirm sizes on target. */
+void
+__cache_flush(void)
+{
+	int agg = 1;
+	for (int i = 0; i < CACHE_SIZE * 4; i += CACHE_LINE_SIZE) {
+		pool[i] += agg;
+		agg = pool[i];
+	}
+}
+
 /***
  * The high priority thread periodically challenges the lock while the low priority thread keeps spinning.
  * When the low-priority thread detects that the flag is changed, it knows that the lock is challenged.
@@ -45,9 +80,10 @@ lock_hi_thd(void *d)
 	while (1) {
 		debug("h1,");
 		sched_thd_block(0);
-		sched_thd_block_timeout(0, time_now() + time_usec2cyc(1000));
+		sched_thd_block_timeout(0, time_now() + time_usec2cyc(SLEEP_TIME));
 
 		debug("h2,");
+		cache_flush();
 		flag  = 1;
 		start = time_now();
 		patina_mutex_lock(mid);
@@ -62,9 +98,8 @@ void
 lock_lo_thd(void *d)
 {
 	int i;
-	int first = 0;
 
-	for (i = 0; i < ITERATION + 1; i++) {
+	for (i = 0; i < ITERATION + COLD_OFFSET; i++) {
 		debug("l1,");
 		sched_thd_wakeup(lock_hi);
 
@@ -76,19 +111,15 @@ lock_lo_thd(void *d)
 		while (flag != 1) {}
 		patina_mutex_unlock(mid);
 
-		if (first == 0)
-			first = 1;
-		else
-			perfdata_add(&perf, end - start);
+		if (i != COLD_INDEX) { perfdata_add(&perf, end - start); }
 		debug("l4,");
 	}
 
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	while (1)
 		;
@@ -98,7 +129,6 @@ void
 test_lock(void)
 {
 	int i;
-	int first = 0;
 
 	sched_param_t sps[] = {SCHED_PARAM_CONS(SCHEDP_PRIO, 4), SCHED_PARAM_CONS(SCHEDP_PRIO, 6)};
 
@@ -106,24 +136,21 @@ test_lock(void)
 
 	/* Uncontended lock taking/releasing */
 	perfdata_init(&perf, "Uncontended lock - take+release", result, ITERATION);
-	for (i = 0; i < ITERATION + 1; i++) {
+	for (i = 0; i < ITERATION + COLD_OFFSET; i++) {
+		cache_flush();
 		start = time_now();
 
 		patina_mutex_lock(mid);
 		patina_mutex_unlock(mid);
 
 		end = time_now();
-		if (first == 0)
-			first = 1;
-		else
-			perfdata_add(&perf, end - start);
+		if (i != COLD_INDEX) { perfdata_add(&perf, end - start); }
 	}
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	perfdata_init(&perf, "Contended lock - take+release", result, ITERATION);
 
diff --git a/src/components/implementation/tests/patina_sem_bench/patina_sem_bench.c b/src/components/implementation/tests/patina_sem_bench/patina_sem_bench.c
index 23c7030f60..2452f14ca5 100644
--- a/src/components/implementation/tests/patina_sem_bench/patina_sem_bench.c
+++ b/src/components/implementation/tests/patina_sem_bench/patina_sem_bench.c
@@ -63,9 +63,8 @@ void
 sem_lo_thd(void *d)
 {
 	int i;
-	int first = 0;
 
-	for (i = 0; i < ITERATION + 1; i++) {
+	for (i = 0; i < ITERATION; i++) {
 		debug("l1");
 		sched_thd_wakeup(sem_hi);
 
@@ -78,19 +77,15 @@ sem_lo_thd(void *d)
 
 		patina_sem_give(sid);
 
-		if (first == 0)
-			first = 1;
-		else
-			perfdata_add(&perf, end - start);
+		perfdata_add(&perf, end - start);
 		debug("l4");
 	}
 
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	while (1)
 		;
@@ -100,7 +95,6 @@ void
 test_sem(void)
 {
 	int      i;
-	int      first = 0;
 	cycles_t start, end;
 
 	sched_param_t sps[] = {SCHED_PARAM_CONS(SCHEDP_PRIO, 4), SCHED_PARAM_CONS(SCHEDP_PRIO, 6)};
@@ -109,24 +103,20 @@ test_sem(void)
 
 	/* Uncontended semaphore taking/releasing */
 	perfdata_init(&perf, "Uncontended semaphore - take+give", result, ITERATION);
-	for (i = 0; i < ITERATION + 1; i++) {
+	for (i = 0; i < ITERATION; i++) {
 		start = time_now();
 
 		patina_sem_take(sid);
 		patina_sem_give(sid);
 
 		end = time_now();
-		if (first == 0)
-			first = 1;
-		else
-			perfdata_add(&perf, end - start);
+		perfdata_add(&perf, end - start);
 	}
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	perfdata_init(&perf, "Contended semaphore - take+give", result, ITERATION);
 
diff --git a/src/components/implementation/tests/patina_timer_bench/patina_timer_bench.c b/src/components/implementation/tests/patina_timer_bench/patina_timer_bench.c
index 8d475cc300..65948b5e3f 100644
--- a/src/components/implementation/tests/patina_timer_bench/patina_timer_bench.c
+++ b/src/components/implementation/tests/patina_timer_bench/patina_timer_bench.c
@@ -10,6 +10,20 @@
 #include <patina.h>
 #include <perfdata.h>
 
+#define COLD_CACHE
+#ifdef COLD_CACHE
+#define cache_flush() __cache_flush()
+#define COLD_OFFSET 1 /* extra loop iteration that makes up for the skipped sample */
+#define COLD_INDEX 0  /* loop index whose sample is not recorded */
+#else
+#define cache_flush()
+#define COLD_OFFSET 0
+#define COLD_INDEX (-1) /* matches no loop index, so every sample is recorded */
+#endif
+/* Sizes in bytes used by __cache_flush(). NOTE(review): confirm they match the target's cache. */
+#define CACHE_SIZE (512 * 1024)
+#define CACHE_LINE_SIZE 32
+
 #undef TMR_TRACE_DEBUG
 #ifdef TMR_TRACE_DEBUG
 #define debug(format, ...) printc(format, ##__VA_ARGS__)
@@ -18,8 +32,14 @@
 #endif
 
 /* High-priority thread interrupts the low-priority thread by timer ticks */
+#ifdef COLD_CACHE /* cold-cache runs take fewer samples with a longer timer period */
+#define ITERATION (10 * 10)
+#define TMR_PERIODIC_TIME (1000 * 1000)
+#else
 #define ITERATION 10 * 1000
 #define TMR_PERIODIC_TIME 10 * 1000
+#endif
+
 #define DROP_THRESHOLD 0x1000000U
 
 #define PRINT_ALL
@@ -35,6 +55,20 @@ cycles_t        result[ITERATION] = {
   0,
 };
 
+volatile char pool[CACHE_SIZE * 4] = {0};
+
+/* Read-modify-write one byte in each CACHE_LINE_SIZE stride of pool; agg feeds every value
+ * into the next access. NOTE(review): presumably evicts cached state - confirm sizes on target. */
+void
+__cache_flush(void)
+{
+	int agg = 1;
+	for (int i = 0; i < CACHE_SIZE * 4; i += CACHE_LINE_SIZE) {
+		pool[i] += agg;
+		agg = pool[i];
+	}
+}
+
 /***
  * The high priority thread sets up a periodic timer while the low priority thread keeps looping and updating
  * the timing value variable. The variable is a 32-bit one so that it can be updated atomically. We always
@@ -47,7 +81,6 @@ tmr_hi_thd(void *d)
 	patina_time_t  t;
 	patina_timer_t tid;
 	patina_event_t evt;
-	int            first = 0;
 
 	printc("Call into timer manager to make a timer.\n");
 	tid = patina_timer_create();
@@ -67,27 +100,24 @@ tmr_hi_thd(void *d)
 
 	/* Event loop */
 	i = 0;
-	while (i < ITERATION + 1) {
+	while (i < ITERATION + COLD_OFFSET) {
+		cache_flush();
 		patina_event_wait(&evt, NULL, 0);
 		end = (cycles_32_t)time_now();
 
 		if ((end - start) > DROP_THRESHOLD) continue;
 
-		if (first == 0)
-			first = 1;
-		else
-			perfdata_add(&perf, end - start);
+		if (i != COLD_INDEX) { perfdata_add(&perf, end - start); }
 		debug("%lld.\n", end - start);
 
 		i++;
 	}
 
-	perfdata_calc(&perf);
 #ifdef PRINT_ALL
-	perfdata_all(&perf);
-#else
-	perfdata_print(&perf);
+	perfdata_raw(&perf);
 #endif
+	perfdata_calc(&perf);
+	perfdata_print(&perf);
 
 	while (1)
 		;
diff --git a/src/components/implementation/tests/test.c b/src/components/implementation/tests/test.c
new file mode 100644
index 0000000000..33b130abcc
--- /dev/null
+++ b/src/components/implementation/tests/test.c
@@ -0,0 +1,8 @@
+#include <stdio.h>
+/* NOTE(review): hosted program using printf rather than printc like the other tests - confirm it belongs in this patch. */
+int main(void)
+{
+	void *p = NULL;
+	printf("%p %p\n", p, (void *)((char *)p + 1)); /* ISO C has no arithmetic on void *; step through char * */
+	return 0;
+}
diff --git a/src/components/implementation/tests/unit_pingpong/Makefile b/src/components/implementation/tests/unit_pingpong/Makefile
index 6a48f05d9d..403512929a 100644
--- a/src/components/implementation/tests/unit_pingpong/Makefile
+++ b/src/components/implementation/tests/unit_pingpong/Makefile
@@ -9,7 +9,7 @@ INTERFACE_EXPORTS =
 INTERFACE_DEPENDENCIES = init pong
 # The library dependencies this component is reliant on for
 # compilation/linking (this is a list of directory names in lib/)
-LIBRARY_DEPENDENCIES = kernel ps
+LIBRARY_DEPENDENCIES = kernel ps ubench
 # Note: Both the interface and library dependencies should be
 # *minimal*. That is to say that removing a dependency should cause
 # the build to fail. The build system does not validate this
diff --git a/src/components/lib/ubench/perfdata.h b/src/components/lib/ubench/perfdata.h
index b60bcbb11e..4c1d633c37 100644
--- a/src/components/lib/ubench/perfdata.h
+++ b/src/components/lib/ubench/perfdata.h
@@ -229,7 +229,7 @@ perfdata_99ptile(struct perfdata *pd)
 static void
 perfdata_print(struct perfdata *pd)
 {
-	printc("PD:%s -sz:%d,SD:%llu,Mean:%llu,99%%:%llu, Max: %llu\n", 
+	printc("#PD:%s -sz:%d,SD:%llu,Mean:%llu,99%%:%llu, Max: %llu\n", 
 		pd->name, pd->sz, pd->sd, pd->avg, pd->ptiles[PTILE_99], pd->max);
 }
 
@@ -240,12 +240,12 @@ perfdata_all(struct perfdata *pd)
 
 	perfdata_print(pd);
 	
-	printc(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n\n");
+	printc("#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n\n");
 	
 	printc("#Latency\n");
 	for (i = 0 ; i < pd->sz ; i++) printc("V: %llu\n", pd->values[i]);
 	
-	printc("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n");
+	printc("#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n");
 }
 
 static void