diff --git a/Makefile b/Makefile index b80f1abe..a6962483 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,6 @@ DEPFLAGS += -MD -MF $(SRCDIR)/$*.d SOURCE := $(wildcard $(SRCDIR)/*.cpp) HEADERS += $(wildcard $(SRCDIR)/*.h) -HEADERS += $(wildcard $(SRCDIR)/*.hpp) OBJ := $(SOURCE:.cpp=.o) DEPS := $(SOURCE:.cpp=.d) @@ -32,11 +31,10 @@ DEPS := $(SOURCE:.cpp=.d) # Install variables SCRIPT_DIR = scripts/flitcli -DATA_DIR = $(SCRIPT_DIR)/data +DATA_DIR = data CONFIG_DIR = $(SCRIPT_DIR)/config DOC_DIR = documentation LITMUS_TESTS += $(wildcard litmus-tests/tests/*.cpp) -LITMUS_TESTS += $(wildcard litmus-tests/tests/*.hpp) LITMUS_TESTS += $(wildcard litmus-tests/tests/*.h) INSTALL_FLIT_CONFIG = $(PREFIX)/share/flit/scripts/flitconfig.py @@ -92,6 +90,7 @@ install: $(TARGET) mkdir -m 0755 -p $(PREFIX)/share/flit/scripts mkdir -m 0755 -p $(PREFIX)/share/flit/doc mkdir -m 0755 -p $(PREFIX)/share/flit/data/tests + mkdir -m 0755 -p $(PREFIX)/share/flit/data/db mkdir -m 0755 -p $(PREFIX)/share/flit/config mkdir -m 0755 -p $(PREFIX)/share/flit/litmus-tests ln -sf ../share/flit/scripts/flit.py $(PREFIX)/bin/flit @@ -105,6 +104,7 @@ install: $(TARGET) install -m 0644 $(DATA_DIR)/custom.mk $(PREFIX)/share/flit/data/ install -m 0644 $(DATA_DIR)/main.cpp $(PREFIX)/share/flit/data/ install -m 0644 $(DATA_DIR)/tests/Empty.cpp $(PREFIX)/share/flit/data/tests/ + install -m 0644 $(DATA_DIR)/db/tables-psql.sql $(PREFIX)/share/flit/data/db/ install -m 0644 $(CONFIG_DIR)/flit-default.toml.in $(PREFIX)/share/flit/config/ install -m 0644 $(LITMUS_TESTS) $(PREFIX)/share/flit/litmus-tests/ @echo "Generating $(INSTALL_FLIT_CONFIG)" diff --git a/scripts/flitcli/data/Makefile.in b/data/Makefile.in similarity index 70% rename from scripts/flitcli/data/Makefile.in rename to data/Makefile.in index 49ba813e..d4afe49a 100644 --- a/scripts/flitcli/data/Makefile.in +++ b/data/Makefile.in @@ -1,9 +1,10 @@ # Autogenerated Makefile using "flit update" -DEV_CC := {compiler} FFLAGS ?= DEV_TARGET ?= devrun DEV_CUTARGET ?= cu_devrun +GT_TARGET ?= gtrun +GT_OUT := ground-truth.csv UNAME_S := $(shell uname -s) @@ -22,6 +23,14 @@ DEV_CFLAGS += -Wextra DEV_CFLAGS += -Wuninitialized DEV_CFLAGS += -Wno-shift-count-overflow +DEV_CC ?= {dev_compiler} +DEV_OPTL ?= {dev_optl} +DEV_SWITCHES ?= {dev_switches} + +GT_CC := {ground_truth_compiler} +GT_OPTL := {ground_truth_optl} +GT_SWITCHES := {ground_truth_switches} + LD_REQUIRED += -lm LD_REQUIRED += -lstdc++ ifeq ($(UNAME_S),Darwin) # If we are on a Mac OSX system @@ -39,8 +48,10 @@ TESTS := $(wildcard tests/*.cpp) SOURCE := $(wildcard *.cpp) SOURCE += $(TESTS) -DEV_OBJ := $(SOURCE:%.cpp=%_dev.o) -DEV_DEPS := $(SOURCE:%.cpp=%_dev.d) +DEV_OBJ = $(SOURCE:%.cpp=%_dev.o) +DEV_DEPS = $(SOURCE:%.cpp=%_dev.d) +GT_OBJ = $(SOURCE:%.cpp=%_gt.o) +GT_DEPS = $(SOURCE:%.cpp=%_gt.d) CLANG := clang++ INTEL := icpc @@ -194,18 +205,21 @@ SWITCHES_INTEL += USEFASTM TARGETS := $(foreach c, $(COMPILERS), \ $(foreach s, $(SWITCHES_$(strip $c)), \ $(foreach o, $(OPCODES), \ - $c_$(HOSTNAME)_$(strip $(s))_$(strip $(o))_out \ + $(RESULTS_DIR)/$c_$(HOSTNAME)_$(strip $s)_$(strip $o)_out.csv \ ) \ ) \ ) -BIN := $(TARGETS:%_out=%) -OBJ = $(foreach b,$(BIN),$(SOURCE:%.cpp=%_$(strip $(b)).o)) +BIN := $(TARGETS:%_out.csv=%) +OBJ = $(foreach b,$(BIN),$(SOURCE:%.cpp=%_$(notdir $b).o)) ################################################## # # Now deal with CUDA stuff if it is even available # ################################################## + +# TODO: double check CUDA flags. Really? No optimization levels? 
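# Illustration only (hypothetical values, not lines added by this patch): once
# "flit update" renders this template from flit-config.toml, the {dev_*} and
# {ground_truth_*} placeholders above become concrete settings, for example:
#
#   DEV_CC       ?= g++
#   DEV_OPTL     ?= -O2
#   DEV_SWITCHES ?= -funsafe-math-optimizations
#   GT_CC        := g++
#   GT_OPTL      := -O0
#   GT_SWITCHES  :=
#
# The actual compiler, optimization level, and switches come from the user's
# flit-config.toml; the values shown here are only an example.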
+ NVCC_BIN := nvcc NVCC := $(shell which $(NVCC_BIN)) CUDA_DIR := $(dir $(NVCC))/.. @@ -272,13 +286,12 @@ CUSWITCHES += PRECSFC CUSWITCHES += PRECSTC CUTARGETS := $(foreach s, $(CUSWITCHES), \ - NVCC_$(HOSTNAME)_$(strip $(s))_out) -CUBIN := $(CUTARGETS:%_out=%) -CUOBJ := $(foreach b,$(CUBIN),$(CUSOURCE:%.cpp=%_$b.o)) + $(RESULTS_DIR)/NVCC_$(HOSTNAME)_$(strip $(s))_out.csv) +CUBIN := $(CUTARGETS:%_out.csv=%) +CUOBJ := $(foreach b,$(CUBIN),$(CUSOURCE:%.cpp=%_$(notdir $b).o)) endif # ifdef HAS_CUDA - .PHONY: help help: @echo 'You can run the Makefile directly, but it is recommended to use' @@ -293,42 +306,50 @@ help: @echo ' help Show this help and exit (default target)' @echo ' dev Only run the devel compilation to test things out' @echo ' devcuda Only run the devel CUDA compilation to test CUDA out' + @echo ' groundtruth Compile the ground-truth version and get its output' + @echo ' gt Same as groundtruth' @echo ' run Run all combinations of compilation, results in results/' @echo ' clean Clean intermediate files' @echo ' veryclean Runs clean + removes targets and results' @echo ' distclean Same as veryclean' @echo -.PHONY: dev devcuda run +.PHONY: dev devcuda gt groundtruth run dev: $(DEV_TARGET) devcuda: $(DEV_CUTARGET) -run: $(TARGETS) $(CUTARGETS) cleanResults archive +gt: $(GT_TARGET) $(GT_OUT) +groundtruth: $(GT_TARGET) $(GT_OUT) + +run: $(TARGETS) $(CUTARGETS) .PHONY: clean clean: rm -f $(DEV_OBJ) rm -f $(DEV_DEPS) rm -f $(DEV_CUOBJ) - rm -f $(TARGETS) - rm -f $(CUTARGETS) - rm -f $(BIN) rm -f $(OBJ) - rm -f $(CUBIN) rm -f $(CUOBJ) + rm -f $(GT_OBJ) + rm -f $(GT_DEPS) .PHONY: veryclean distclean veryclean: distclean distclean: clean rm -f $(DEV_TARGET) rm -f $(DEV_CUTARGET) - rm -f $(TARGETS:%=$(RESULTS_DIR)/%) - rm -f $(BIN:%=$(RESULTS_DIR)/%) - rm -f $(CUTARGETS:%=$(RESULTS_DIR)/%) - rm -f $(CUBIN:%=$(RESULTS_DIR)/%) + rm -f $(TARGETS) + rm -f $(addsuffix *.dat,$(TARGETS)) + rm -f $(BIN) + rm -f $(CUTARGETS) + rm -f $(addsuffix *.dat,$(CUTARGETS)) + rm -f $(CUBIN) + rm -f $(GT_TARGET) + rm -f $(GT_OUT) + rm -f $(addsuffix *.dat,$(GT_OUT)) -rmdir $(RESULTS_DIR) .PRECIOUS: %.d --include $(SOURCE:%.cpp=%.d) +-include $(SOURCE:%.cpp=%.d) $(DEV_DEPS) $(GT_DEPS) Makefile: flit-config.toml $(dir $(FLIT_SCRIPT))/flit_update.py $(FLIT_SCRIPT) update @@ -345,6 +366,7 @@ cleanlibflit: rm -rf lib $(DEV_TARGET): lib/libflit.so +$(GT_TARGET): lib/libflit.so $(BIN): lib/libflit.so $(CUBIN): lib/libflit.so endif # ifeq ($(UNAME_S),Darwin): meaning, we are on a mac @@ -360,7 +382,12 @@ $(DEV_TARGET): $(DEV_OBJ) Makefile -o $@ $(DEV_OBJ) $(LD_REQUIRED) $(DEV_LDFLAGS) %_dev.o: %.cpp Makefile - $(DEV_CC) $(CC_REQUIRED) $(DEV_CFLAGS) $(DEPFLAGS) -c $< -o $@ + $(DEV_CC) $(DEV_OPTL) $(DEV_SWITCHES) $(CC_REQUIRED) $(DEV_CFLAGS) $(DEPFLAGS) -c $< -o $@ \ + -DFLIT_HOST='"$(HOSTNAME)"' \ + -DFLIT_COMPILER='"$(DEV_CC)"' \ + -DFLIT_OPTL='"$(DEV_OPTL)"' \ + -DFLIT_SWITCHES='"$(DEV_SWITCHES)"' \ + -DFLIT_FILENAME='"$(notdir $(DEV_TARGET))"' ifdef HAS_CUDA $(DEV_CUTARGET): $(DEV_CUOBJ) Makefile @@ -370,10 +397,27 @@ $(DEV_CUTARGET): $(DEV_CUOBJ) Makefile $(NVCC) -c $(NVCC_CFLAGS) $(DEV_NVCC_CC) $< -o $@ endif # ifdef HAS_CUDA +# Ground truth compilation rules +$(GT_OUT): $(GT_TARGET) + ./$(GT_TARGET) --output $(GT_OUT) + +$(GT_TARGET): $(GT_OBJ) Makefile + $(GT_CC) $(CC_REQUIRED) -o $@ $(GT_OBJ) $(LD_REQUIRED) + +%_gt.o: %.cpp Makefile + $(GT_CC) $(GT_OPTL) $(GT_SWITCHES) $(CC_REQUIRED) $(DEPFLAGS) -c $< -o $@ \ + -DFLIT_HOST='"$(HOSTNAME)"' \ + -DFLIT_COMPILER='"$(GT_CC)"' \ + -DFLIT_OPTL='"$(GT_OPTL)"' \ + 
-DFLIT_SWITCHES='"$(GT_SWITCHES)"' \ + -DFLIT_FILENAME='"$(notdir $(GT_TARGET))"' + # Now the true magic begins for the full run compilation rules # very cool: recyclable target definitions! adapted from # http://make.mad-scientist.net/the-eval-function/ +# TODO: try without PERCENT variable +PERCENT := % # Generates rules for # 1. compiling @@ -386,34 +430,34 @@ endif # ifdef HAS_CUDA # @param $2: variable name containing the compiler to use (e.g. GCC) # @param $3: variable name containing the optimization level (e.g. O2) define TARGETS_RULE -#run test and collect results -$(strip $2)_$(HOSTNAME)_$(strip $1)_$(strip $3)_out : $2_$(HOSTNAME)_$(strip $1)_$(strip $3) - -./$$< --output $$@ - -sed -i -e 's/HOST/$(HOSTNAME)/g' $$@ - -sed -i -e 's/SWITCHES/$($(strip $1))/g' $$@ - -sed -i -e 's/OPTL/$($(strip $3))/g' $$@ - -sed -i -e 's/COMPILER/$($(strip $2))/g' $$@ - -sed -i -e 's/FILENAME/$$= 0), -- timing for the function + + foreign key(run) references runs(id) + ); + +-- Tables not created: +-- * clusters +-- * op_counts +-- * opcodes +-- * skipped_pin +-- * switch_conv +-- * switch_desc +-- Do we need these tables? I don't know. diff --git a/scripts/flitcli/data/main.cpp b/data/main.cpp similarity index 56% rename from scripts/flitcli/data/main.cpp rename to data/main.cpp index 6b4798bc..60a4a640 100644 --- a/scripts/flitcli/data/main.cpp +++ b/data/main.cpp @@ -1,5 +1,5 @@ #include "flit.h" int main(int argCount, char* argList[]) { - return runFlitTests(argCount, argList); + return flit::runFlitTests(argCount, argList); } diff --git a/scripts/flitcli/data/tests/Empty.cpp b/data/tests/Empty.cpp similarity index 55% rename from scripts/flitcli/data/tests/Empty.cpp rename to data/tests/Empty.cpp index 186d846c..713bb1bd 100644 --- a/scripts/flitcli/data/tests/Empty.cpp +++ b/data/tests/Empty.cpp @@ -1,18 +1,17 @@ -#include "flit.h" +#include #include template GLOBAL -void Empty_kernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results) { +void Empty_kernel(const flit::CuTestInput* tiList, double* results) { #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else auto idx = 0; #endif auto& ti = tiList[idx]; - results[idx].s1 = ti.vals[0]; - results[idx].s2 = 0.0; + results[idx] = ti.vals[0]; } /** An example test class to show how to make FLiT tests @@ -30,7 +29,7 @@ class Empty : public flit::TestBase { * Can be zero. If it is zero, then getDefaultInput should return an empty * TestInput object which is as simple as "return {};" */ - virtual size_t getInputsPerRun() { return 1; } + virtual size_t getInputsPerRun() override { return 1; } /** Specify the default inputs for your test. * @@ -41,12 +40,44 @@ class Empty : public flit::TestBase { * If your algorithm takes no inputs, then you can simply return an empty * TestInput object. It is as simple as "return {};". */ - flit::TestInput getDefaultInput() { + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { 1.0 }; return ti; } + /** Custom comparison methods + * + * These comparison operations are meant to create a metric between the test + * results from this test in the current compilation, and the results from + * the ground truth compilation. You can do things like the relative error + * or the absolute error (for the case of long double). + * + * The below specified functions are the default implementations defined in + * the base class. It is safe to delete these two functions if this + * implementation is adequate for you. 
+ * + * Which one is used depends on the type of Variant that is returned from the + * run_impl function. The value returned by compare will be the value stored + * in the database for later analysis. + * + * Note: when using the CUDA kernel functionality, only long double return + * values are valid for now. + */ + virtual long double compare(long double ground_truth, + long double test_results) const override { + // absolute error + return test_results - ground_truth; + } + + /** There is no good default implementation comparing two strings */ + virtual long double compare(const std::string &ground_truth, + const std::string &test_results) const override { + FLIT_UNUSED(ground_truth); + FLIT_UNUSED(test_results); + return 0.0; + } + protected: /** Return a kernel pointer to the CUDA kernel equivalent of run_impl * @@ -60,7 +91,7 @@ class Empty : public flit::TestBase { * See the documentation above Empty_kernel() for details about what the * kernel is expected to have. */ - virtual flit::KernelFunction* getKernel() { return Empty_kernel; } + virtual flit::KernelFunction* getKernel() override { return Empty_kernel; } /** Call or implement the algorithm here. * @@ -71,8 +102,8 @@ class Empty : public flit::TestBase { * You are guarenteed that ti will have exactly getInputsPerRun() inputs in * it. If getInputsPerRun() returns zero, then ti.vals will be empty. */ - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { - return {std::pair(ti.vals[0], 0.0), 0}; + virtual flit::Variant run_impl(const flit::TestInput& ti) override { + return ti.vals[0]; } protected: diff --git a/documentation/available-compiler-flags.md b/documentation/available-compiler-flags.md index 00783836..9556f47f 100644 --- a/documentation/available-compiler-flags.md +++ b/documentation/available-compiler-flags.md @@ -17,8 +17,8 @@ In your configuration file `flit-config.toml` (see [FLiT Configuration File](flit-configuration-file.md)), you specify compiler flags for each of the compilers. Only one compiler flag will be used with one optimization level. If you want to have specific flag combinations, you can place it in the list, -such as `"-mavx2 -mfma -ffastmath"`. Below is the original default list for -the supported compilers: +such as `"-mavx2 -mfma -funsafe-math-optimizations"`. Below is the original +default list for the supported compilers: | Flag | GCC | Clang | Intel | NVCC | | ----------------------------- |:-----:|:-----:|:-----:|:----:| diff --git a/gensrc/testcase.py b/gensrc/testcase.py index cee1d918..b4772de4 100644 --- a/gensrc/testcase.py +++ b/gensrc/testcase.py @@ -9,21 +9,20 @@ # - default_input: populate ti.vals vector. # - vars_initialize: initialize scope variable for the test using ti.vals # - cu_vars_initialize: initialize scope variables for the test in CUDA using tiList[idx].vals -# - func_body: test body that is shared between cuda and non-cuda. Populate score1 and score2 +# - func_body: test body that is shared between cuda and non-cuda. 
Populate score template_string = ''' #include "flit.h" template GLOBAL void -{name}Kernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results) {{ +{name}Kernel(const flit::CuTestInput* tiList, double* results) {{ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else auto idx = 0; #endif - decltype(results->s1) score1 = 0.0; - decltype(results->s2) score2 = 0.0; + double score = 0.0; {cu_vars_initialize} @@ -31,8 +30,7 @@ {func_body} }} - results[idx].s1 = score1; - results[idx].s2 = score2; + results[idx] = score; }} template @@ -41,8 +39,8 @@ class {name} : public flit::TestBase {{ {name}(std::string id) : flit::TestBase(std::move(id)) {{}} - virtual size_t getInputsPerRun() {{ return {input_count}; }} - virtual flit::TestInput getDefaultInput() {{ + virtual size_t getInputsPerRun() override {{ return {input_count}; }} + virtual flit::TestInput getDefaultInput() override {{ flit::TestInput ti; {default_input} @@ -51,14 +49,13 @@ class {name} : public flit::TestBase {{ }} protected: - virtual flit::KernelFunction* getKernel() {{ + virtual flit::KernelFunction* getKernel() override {{ return {name}Kernel; }} virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) {{ - T score1 = 0.0; - T score2 = 0.0; + flit::Variant run_impl(const flit::TestInput& ti) override {{ + T score = 0.0; flit::info_stream << id << ": Starting test with parameters" << std::endl; for (T val : ti.vals) {{ @@ -71,9 +68,9 @@ class {name} : public flit::TestBase {{ {func_body} - flit::info_stream << id << ": Ending test with values (" << score1 << ", " << score2 << ")" << std::endl; + flit::info_stream << id << ": Ending test with value (" << score << ")" << std::endl; - return {{std::pair(score1, score2), 0}}; + return score; }} protected: @@ -104,8 +101,6 @@ def __init__(self, name, default_input_vals): # Create an environment for the function body env = Environment({ - #'score1': Variable('score1', 'T'), - #'score2': Variable('score2', 'T'), }) var_list = [Variable('in_{0}'.format(i+1), 'T') for i in range(self.input_count)] env.update(zip([x.name for x in var_list], var_list)) @@ -116,8 +111,7 @@ def __init__(self, name, default_input_vals): var = Variable('e{0}'.format(i+1), 'T') self.func_body_lines.append('{0} {1} = {2};'.format(var.type, var.name, random_expression(env, 3))) env[var.name] = var - self.func_body_lines.append('score1 = {0};'.format(random_expression(env, 4, vars_only=True))) - self.func_body_lines.append('score2 = {0};'.format(random_expression(env, 4, vars_only=True))) + self.func_body_lines.append('score = {0};'.format(random_expression(env, 4, vars_only=True))) def write(self, directory='.'): ''' diff --git a/inputGen/main.cpp b/inputGen/main.cpp index 65f7c019..4d5ae58b 100644 --- a/inputGen/main.cpp +++ b/inputGen/main.cpp @@ -1,9 +1,8 @@ #include "helper.h" #include "groundtruth.h" -#include "TestBase.hpp" -//#include "testbed.h" -#include +#include + #include #include #include @@ -13,6 +12,8 @@ #include #include +#include + #include // For dlopen(), dlsym() and dlclose() //TESTRUN_DEFINE(distribution, 3, RandomFloatType::Positive) diff --git a/litmus-tests/disabled/SimpleCHull.cpp b/litmus-tests/disabled/SimpleCHull.cpp index a77af2ac..82ae6580 100644 --- a/litmus-tests/disabled/SimpleCHull.cpp +++ b/litmus-tests/disabled/SimpleCHull.cpp @@ -17,15 +17,13 @@ class SimpleCHull: public flit::TestBase { protected: virtual flit::KernelFunction* getKernel() { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const 
flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) { FLIT_UNUSED(ti); CHullEdges.clear(); PointList.clear(); ReadInputs(fopen("data/random_input", "r")); SimpleComputeConvexhull(); - return {std::pair((long double) - getEdgeCount(), 0.0), 0}; + return getEdgeCount(); } protected: diff --git a/litmus-tests/tests/DistributivityOfMultiplication.cpp b/litmus-tests/tests/DistributivityOfMultiplication.cpp index 3a83c539..2c0a3e0d 100644 --- a/litmus-tests/tests/DistributivityOfMultiplication.cpp +++ b/litmus-tests/tests/DistributivityOfMultiplication.cpp @@ -13,7 +13,7 @@ template GLOBAL void -DistOfMultKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +DistOfMultKernel(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -24,10 +24,7 @@ DistOfMultKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* re T c = tiList[idx].vals[2]; auto distributed = (a * c) + (b * c); - auto undistributed = (a + b) * c; - results[idx].s1 = distributed; - results[idx].s2 = undistributed; - + results[idx] = distributed; } template @@ -36,32 +33,28 @@ class DistributivityOfMultiplication : public flit::TestBase { DistributivityOfMultiplication(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput(); + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override; protected: - virtual flit::KernelFunction* getKernel() { + virtual flit::KernelFunction* getKernel() override { return DistOfMultKernel; } - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { T a = ti.vals[0]; T b = ti.vals[1]; T c = ti.vals[2]; auto distributed = (a * c) + (b * c); - auto undistributed = (a + b) * c; flit::info_stream << std::setw(8); flit::info_stream << id << ": (a,b,c) = (" << a << "," << b << "," << c << ")" << std::endl; flit::info_stream << id << ": dist = " << distributed << std::endl; - flit::info_stream << id << ": undist = " - << undistributed << std::endl; - return {std::pair(distributed, undistributed), 0}; + return distributed; } protected: diff --git a/litmus-tests/tests/DoHariGSBasic.cpp b/litmus-tests/tests/DoHariGSBasic.cpp index 6494363b..ac546d65 100644 --- a/litmus-tests/tests/DoHariGSBasic.cpp +++ b/litmus-tests/tests/DoHariGSBasic.cpp @@ -7,7 +7,7 @@ template GLOBAL void -DoHGSBTestKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* result){ +DoHGSBTestKernel(const flit::CuTestInput* tiList, double* result){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -30,8 +30,7 @@ DoHGSBTestKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* re double score = std::abs(o12) + std::abs(o13) + std::abs(o23); - result[idx].s1 = score; - result[idx].s2 = 0; + result[idx] = score; } template @@ -39,13 +38,13 @@ class DoHariGSBasic: public flit::TestBase { public: DoHariGSBasic(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 9; } - virtual flit::TestInput getDefaultInput(); + virtual size_t getInputsPerRun() override { return 9; } + virtual flit::TestInput getDefaultInput() override; protected: - virtual flit::KernelFunction* getKernel() { return DoHGSBTestKernel; } - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual 
flit::KernelFunction* getKernel() override { return DoHGSBTestKernel; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { using flit::operator<<; long double score = 0.0; @@ -83,7 +82,7 @@ class DoHariGSBasic: public flit::TestBase { << flit::as_int(score) << std::endl; flit::info_stream << id << ": score (dec): " << score << std::endl; } - return {std::pair(score, 0.0), 0}; + return score; } protected: diff --git a/litmus-tests/tests/DoHariGSImproved.cpp b/litmus-tests/tests/DoHariGSImproved.cpp index 30280684..59831214 100644 --- a/litmus-tests/tests/DoHariGSImproved.cpp +++ b/litmus-tests/tests/DoHariGSImproved.cpp @@ -6,7 +6,7 @@ template GLOBAL void -DoHGSITestKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +DoHGSITestKernel(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -28,8 +28,7 @@ DoHGSITestKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* re double score = std::abs(o12) + std::abs(o13) + std::abs(o23); - results[idx].s1 = score; - results[idx].s2 = 0; + results[idx] = score; } template @@ -37,13 +36,12 @@ class DoHariGSImproved: public flit::TestBase { public: DoHariGSImproved(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 9; } - virtual flit::TestInput getDefaultInput(); + virtual size_t getInputsPerRun() override { return 9; } + virtual flit::TestInput getDefaultInput() override; protected: - virtual flit::KernelFunction* getKernel() { return DoHGSITestKernel; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override { return DoHGSITestKernel; } + virtual flit::Variant run_impl(const flit::TestInput& ti) override { long double score = 0.0; //matrix = {a, b, c}; @@ -70,7 +68,7 @@ class DoHariGSImproved: public flit::TestBase { flit::info_stream << id << ": r3: " << r3 << std::endl; flit::info_stream << id << ": w dot prods: " << o12 << ", " << o13 << ", " << o23 << std::endl; } - return {std::pair(score, 0.0l), 0l}; + return score; } protected: diff --git a/litmus-tests/tests/DoMatrixMultSanity.cpp b/litmus-tests/tests/DoMatrixMultSanity.cpp index 66b76930..3c1f14d2 100644 --- a/litmus-tests/tests/DoMatrixMultSanity.cpp +++ b/litmus-tests/tests/DoMatrixMultSanity.cpp @@ -9,7 +9,7 @@ template GLOBAL void -DoMatrixMultSanityKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +DoMatrixMultSanityKernel(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -18,8 +18,7 @@ DoMatrixMultSanityKernel(const flit::CuTestInput* tiList, flit::CudaResultEle auto ti = tiList[idx]; auto b = flit::VectorCU(ti.vals, ti.length); auto c = flit::MatrixCU::Identity(ti.length) * b; - results[idx].s1 = c.L1Distance(b); - results[idx].s2 = c.LInfDistance(b); + results[idx] = c.L1Distance(b); } template @@ -27,9 +26,9 @@ class DoMatrixMultSanity: public flit::TestBase { public: DoMatrixMultSanity(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 16; } + virtual size_t getInputsPerRun() override { return 16; } - virtual flit::TestInput getDefaultInput() { + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.highestDim = getInputsPerRun(); ti.min = -6; @@ -40,16 +39,16 @@ class DoMatrixMultSanity: public flit::TestBase { } protected: - virtual flit::KernelFunction* getKernel() { 
return DoMatrixMultSanityKernel; } - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override { return DoMatrixMultSanityKernel; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto dim = ti.vals.size(); flit::Vector b(ti.vals); auto c = flit::Matrix::Identity(dim) * b; bool eq = (c == b); flit::info_stream << id << ": Product is: " << c << std::endl; flit::info_stream << id << ": A * b == b? " << eq << std::endl; - return {std::pair(c.L1Distance(b), c.LInfDistance(b)), 0}; + return c.L1Distance(b); } protected: diff --git a/litmus-tests/tests/DoOrthoPerturbTest.cpp b/litmus-tests/tests/DoOrthoPerturbTest.cpp index b7a6a55e..36cd7c75 100644 --- a/litmus-tests/tests/DoOrthoPerturbTest.cpp +++ b/litmus-tests/tests/DoOrthoPerturbTest.cpp @@ -8,7 +8,7 @@ template GLOBAL void -DoOPTKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +DoOPTKernel(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -50,8 +50,7 @@ DoOPTKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results } p = backup; } - results[idx].s1 = score; - results[idx].s2 = 0; + results[idx] = score; } template @@ -59,8 +58,8 @@ class DoOrthoPerturbTest : public flit::TestBase { public: DoOrthoPerturbTest(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 16; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 16; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.iters = 200; ti.ulp_inc = 1; @@ -74,9 +73,9 @@ class DoOrthoPerturbTest : public flit::TestBase { } protected: - virtual flit::KernelFunction* getKernel() { return DoOPTKernel; } - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override { return DoOPTKernel; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { using flit::operator<<; auto iters = ti.iters; @@ -142,7 +141,7 @@ class DoOrthoPerturbTest : public flit::TestBase { << std::endl; cdim++; } - return {std::pair(score, 0.0), 0}; + return score; } private: diff --git a/litmus-tests/tests/DoSimpleRotate90.cpp b/litmus-tests/tests/DoSimpleRotate90.cpp index 043104ae..25b81a8d 100644 --- a/litmus-tests/tests/DoSimpleRotate90.cpp +++ b/litmus-tests/tests/DoSimpleRotate90.cpp @@ -7,7 +7,7 @@ template GLOBAL void -DoSR90Kernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +DoSR90Kernel(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -21,8 +21,7 @@ DoSR90Kernel(const flit::CuTestInput* tiList, flit::CudaResultElement* result auto done = A.rotateAboutZ_3d(M_PI/2); - results[idx].s1 = done.L1Distance(expected); - results[idx].s2 = done.LInfDistance(expected); + results[idx] = done.L1Distance(expected); } template @@ -30,16 +29,17 @@ class DoSimpleRotate90: public flit::TestBase { public: DoSimpleRotate90(std::string id):flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { 1, 1, 1 }; return ti; } - virtual flit::KernelFunction* getKernel() { return DoSR90Kernel; } protected: - 
flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override { return DoSR90Kernel; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { flit::Vector A(ti.vals); flit::Vector expected = {-A[1], A[0], A[2]}; flit::info_stream << "Rotating A: " << A << ", 1/2 PI radians" << std::endl; @@ -47,9 +47,7 @@ class DoSimpleRotate90: public flit::TestBase { flit::info_stream << "Resulting vector: " << A << std::endl; flit::info_stream << "in " << id << std::endl; A.dumpDistanceMetrics(expected, flit::info_stream); - return {std::pair(A.L1Distance(expected), - A.LInfDistance(expected)), - 0}; + return A.L1Distance(expected); } protected: diff --git a/litmus-tests/tests/DoSkewSymCPRotationTest.cpp b/litmus-tests/tests/DoSkewSymCPRotationTest.cpp index 524bd2f7..b70d40be 100644 --- a/litmus-tests/tests/DoSkewSymCPRotationTest.cpp +++ b/litmus-tests/tests/DoSkewSymCPRotationTest.cpp @@ -7,7 +7,7 @@ template GLOBAL void -DoSkewSCPRKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +DoSkewSCPRKernel(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -23,8 +23,7 @@ DoSkewSCPRKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* re auto rMatrix = flit::MatrixCU::Identity(3) + sscpm + (sscpm * sscpm) * ((1 - cos)/(sine * sine)); auto result = rMatrix * A; - results[idx].s1 = result.L1Distance(B); - results[idx].s1 = result.LInfDistance(B); + results[idx] = result.L1Distance(B); } template @@ -33,8 +32,8 @@ class DoSkewSymCPRotationTest: public flit::TestBase { DoSkewSymCPRotationTest(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 6; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 6; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.min = -6; ti.max = 6; @@ -45,13 +44,11 @@ class DoSkewSymCPRotationTest: public flit::TestBase { } protected: - virtual flit::KernelFunction* getKernel() { return DoSkewSCPRKernel;} + virtual flit::KernelFunction* getKernel() override { return DoSkewSCPRKernel;} - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { flit::info_stream << "entered " << id << std::endl; long double L1Score = 0.0; - long double LIScore = 0.0; flit::Vector A = { ti.vals[0], ti.vals[1], ti.vals[2] }; flit::Vector B = { ti.vals[3], ti.vals[4], ti.vals[5] }; A = A.getUnitVector(); @@ -76,7 +73,6 @@ class DoSkewSymCPRotationTest: public flit::TestBase { flit::info_stream << "rotator: " << std::endl << rMatrix << std::endl; if(!(result == B)){ L1Score = result.L1Distance(B); - LIScore = result.LInfDistance(B); flit::info_stream << "Skew symmetric cross product rotation failed with "; flit::info_stream << "L1Distance " << L1Score << std::endl; flit::info_stream << "starting vectors: " << std::endl; @@ -85,9 +81,8 @@ class DoSkewSymCPRotationTest: public flit::TestBase { flit::info_stream << B << std::endl; flit::info_stream << "ended up with: " << std::endl; flit::info_stream << "L1Distance: " << L1Score << std::endl; - flit::info_stream << "LIDistance: " << LIScore << std::endl; } - return {std::pair(L1Score, LIScore), 0}; + return L1Score; } private: diff --git a/litmus-tests/tests/FMACancel.cpp b/litmus-tests/tests/FMACancel.cpp index afc48363..72b95804 100644 --- 
a/litmus-tests/tests/FMACancel.cpp +++ b/litmus-tests/tests/FMACancel.cpp @@ -9,27 +9,24 @@ class FMACancel : public flit::TestBase { public: FMACancel(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 2; } + virtual size_t getInputsPerRun() override { return 2; } - flit::TestInput getDefaultInput() { + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { .1, 1.1e5 }; return ti; } protected: - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { const T a = ti.vals[0]; const T b = ti.vals[1]; const T c = a; const T d = -b; const T score = a*b + c*d; - const T rtemp = c*d; - const T score2 = a*b + rtemp; flit::info_stream << id << ": score = " << score << std::endl; - flit::info_stream << id << ": score2 = " << score2 << std::endl; - return {std::pair(score, score2), 0}; + return score; } protected: diff --git a/litmus-tests/tests/InliningProblem.cpp b/litmus-tests/tests/InliningProblem.cpp index e8644eea..77d582f3 100644 --- a/litmus-tests/tests/InliningProblem.cpp +++ b/litmus-tests/tests/InliningProblem.cpp @@ -9,9 +9,9 @@ class InliningProblem : public flit::TestBase { public: InliningProblem(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 1; } + virtual size_t getInputsPerRun() override { return 1; } - flit::TestInput getDefaultInput() { + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { .1, 1.1e3, -.1, -1.1e3, 1/3 }; return ti; @@ -23,16 +23,15 @@ class InliningProblem : public flit::TestBase { const T x_again = -nx; return x_again; } - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { T a = ti.vals[0]; T also_a = identity(a); const T score = std::sqrt(a) * std::sqrt(also_a); - const T score2 = std::pow(std::sqrt(a), 2); flit::info_stream << id << ": score = " << score << std::endl; - flit::info_stream << id << ": score2 = " << score2 << std::endl; - return {std::pair(score, score2), 0}; + + return score; } protected: diff --git a/litmus-tests/tests/KahanSum.cpp b/litmus-tests/tests/KahanSum.cpp index 1afc3c31..3c3ecbbc 100644 --- a/litmus-tests/tests/KahanSum.cpp +++ b/litmus-tests/tests/KahanSum.cpp @@ -15,11 +15,11 @@ class KahanSum : public flit::TestBase { public: KahanSum(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 10000; } - virtual flit::TestInput getDefaultInput(); + virtual size_t getInputsPerRun() override { return 10000; } + virtual flit::TestInput getDefaultInput() override; protected: - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { Kahan kahan; Shewchuk chuk; T naive = 0.0; @@ -28,13 +28,15 @@ class KahanSum : public flit::TestBase { kahan.add(val); naive += val; } + T kahan_sum = kahan.sum(); + T shewchuk_sum = chuk.sum(); flit::info_stream << id << ": pi = " << static_cast(PI) << std::endl; flit::info_stream << id << ": exp(1) = " << static_cast(EXP) << std::endl; flit::info_stream << id << ": naive sum = " << naive << std::endl; - flit::info_stream << id << ": kahan sum = " << kahan.sum() << std::endl; - flit::info_stream << id << ": shewchuk sum = " << kahan.sum() << std::endl; + flit::info_stream << id << ": kahan sum = " << kahan_sum << std::endl; + flit::info_stream << 
id << ": shewchuk sum = " << shewchuk_sum << std::endl; flit::info_stream << id << ": Epsilon = " << std::numeric_limits::epsilon() << std::endl; - return {std::pair(kahan.sum(), naive), 0}; + return kahan_sum; } protected: diff --git a/litmus-tests/tests/Paranoia.cpp b/litmus-tests/tests/Paranoia.cpp index 946f6540..3b5c2b9e 100644 --- a/litmus-tests/tests/Paranoia.cpp +++ b/litmus-tests/tests/Paranoia.cpp @@ -208,11 +208,11 @@ class Paranoia : public flit::TestBase { public: Paranoia(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 0; } - virtual flit::TestInput getDefaultInput() { return {}; } + virtual size_t getInputsPerRun() override { return 0; } + virtual flit::TestInput getDefaultInput() override { return {}; } protected: - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti); + virtual flit::Variant run_impl(const flit::TestInput& ti) override; void setTimeout(long millis); // starts the timer for checkTimeout() void checkTimeout(); // throws TimeoutError if timer from setTimeout has expired @@ -320,7 +320,7 @@ void sigfpe(int i) } template -flit::ResultType::mapped_type Paranoia::run_impl(const flit::TestInput& ti) +flit::Variant Paranoia::run_impl(const flit::TestInput& ti) { FLIT_UNUSED(ti); int timeoutMillis = 1000; @@ -1867,9 +1867,7 @@ flit::ResultType::mapped_type Paranoia::run_impl(const flit::TestInput& ti status = ExitStatus::OverflowStatus; } - return {std::pair(Milestone, - static_cast(status)), - 0}; + return Milestone; } /* setTimeout */ diff --git a/litmus-tests/tests/ReciprocalMath.cpp b/litmus-tests/tests/ReciprocalMath.cpp index 635b5645..3687dd44 100644 --- a/litmus-tests/tests/ReciprocalMath.cpp +++ b/litmus-tests/tests/ReciprocalMath.cpp @@ -9,16 +9,16 @@ class ReciprocalMath : public flit::TestBase { public: ReciprocalMath(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 5; } + virtual size_t getInputsPerRun() override { return 5; } - flit::TestInput getDefaultInput() { + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { .1, 1.1e3, -.1, -1.1e3, 1/3 }; return ti; } protected: - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { T a = ti.vals[0]; T b = ti.vals[1]; T c = ti.vals[2]; @@ -30,12 +30,11 @@ class ReciprocalMath : public flit::TestBase { c = c/m; d = d/m; - const T score = a + c; - const T score2 = b + d; + const T score = a + b + c + d; flit::info_stream << id << ": score = " << score << std::endl; - flit::info_stream << id << ": score2 = " << score2 << std::endl; - return {std::pair(score, score2), 0}; + + return score; } protected: diff --git a/litmus-tests/tests/RotateAndUnrotate.cpp b/litmus-tests/tests/RotateAndUnrotate.cpp index 80916b07..86987324 100644 --- a/litmus-tests/tests/RotateAndUnrotate.cpp +++ b/litmus-tests/tests/RotateAndUnrotate.cpp @@ -7,7 +7,7 @@ template GLOBAL void -RaUKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +RaUKern(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -19,8 +19,7 @@ RaUKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ auto orig = A; A = A.rotateAboutZ_3d(theta); A = A.rotateAboutZ_3d(-theta); - results[idx].s1 = A.L1Distance(orig); - results[idx].s2 = A.LInfDistance(orig); + results[idx] = A.L1Distance(orig); } template @@ -28,8 +27,8 @@ 
class RotateAndUnrotate: public flit::TestBase { public: RotateAndUnrotate(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.min = -6; ti.max = 6; @@ -38,9 +37,9 @@ class RotateAndUnrotate: public flit::TestBase { } protected: - virtual flit::KernelFunction* getKernel() { return RaUKern; } - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override { return RaUKern; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto theta = M_PI; auto A = flit::Vector(ti.vals); auto orig = A; @@ -58,7 +57,7 @@ class RotateAndUnrotate: public flit::TestBase { } flit::info_stream << "in " << id << std::endl; A.dumpDistanceMetrics(orig, flit::info_stream); - return {std::pair(dist, A.LInfDistance(orig)), 0}; + return dist; } protected: diff --git a/litmus-tests/tests/RotateFullCircle.cpp b/litmus-tests/tests/RotateFullCircle.cpp index a5abd6ce..4361d063 100644 --- a/litmus-tests/tests/RotateFullCircle.cpp +++ b/litmus-tests/tests/RotateFullCircle.cpp @@ -7,7 +7,7 @@ template GLOBAL void -RFCKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +RFCKern(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -21,8 +21,7 @@ RFCKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ for(decltype(n) r = 0; r < n; ++r){ A = A.rotateAboutZ_3d(theta); } - results[idx].s1 = A.L1Distance(orig); - results[idx].s2 = A.LInfDistance(orig); + results[idx] = A.L1Distance(orig); } template @@ -30,8 +29,8 @@ class RotateFullCircle: public flit::TestBase { public: RotateFullCircle(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.min = -6; ti.max = 6; @@ -43,8 +42,9 @@ class RotateFullCircle: public flit::TestBase { } protected: - virtual flit::KernelFunction* getKernel() {return RFCKern; } - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override {return RFCKern; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto n = ti.iters; flit::Vector A = flit::Vector(ti.vals); auto orig = A; @@ -62,8 +62,7 @@ class RotateFullCircle: public flit::TestBase { } flit::info_stream << "in " << id << std::endl; A.dumpDistanceMetrics(orig, flit::info_stream); - return {std::pair(A.L1Distance(orig), - A.LInfDistance(orig)), 0}; + return A.L1Distance(orig); } private: diff --git a/litmus-tests/tests/ShewchukSum.cpp b/litmus-tests/tests/ShewchukSum.cpp index 31a86b69..009fce60 100644 --- a/litmus-tests/tests/ShewchukSum.cpp +++ b/litmus-tests/tests/ShewchukSum.cpp @@ -11,22 +11,16 @@ class ShewchukSum : public flit::TestBase { public: ShewchukSum(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 1000; } - virtual flit::TestInput getDefaultInput(); + virtual size_t getInputsPerRun() override { return 1000; } + virtual flit::TestInput getDefaultInput() override; protected: - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + 
virtual flit::Variant run_impl(const flit::TestInput& ti) override { Shewchuk chuk; T naive = 0.0; for (auto val : ti.vals) { chuk.add(val); naive += val; - //flit::info_stream - // << std::setw(7) - // << std::setprecision(7) - // << id << ": + " << val - // << " = " << chuk.sum() << " or " << naive - // << std::endl; flit::info_stream << id << ": partials now: (" << chuk.partials().size() << ") "; for (auto p : chuk.partials()) { @@ -35,10 +29,10 @@ class ShewchukSum : public flit::TestBase { flit::info_stream << std::endl; } T sum = chuk.sum(); - flit::info_stream << id << ": naive sum = " << naive << std::endl; - flit::info_stream << id << ": shewchuk sum = " << sum << std::endl; + flit::info_stream << id << ": naive sum = " << naive << std::endl; + flit::info_stream << id << ": shewchuk sum = " << sum << std::endl; flit::info_stream << id << ": shewchuk partials = " << chuk.partials().size() << std::endl; - return {std::pair(sum, chuk.sum2()), 0}; + return sum; } protected: diff --git a/litmus-tests/tests/SinInt.cpp b/litmus-tests/tests/SinInt.cpp index ba18efee..f12a9acd 100644 --- a/litmus-tests/tests/SinInt.cpp +++ b/litmus-tests/tests/SinInt.cpp @@ -11,9 +11,9 @@ class SinInt : public flit::TestBase { public: SinInt(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 1; } + virtual size_t getInputsPerRun() override { return 1; } - flit::TestInput getDefaultInput() { + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; const T pi = 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998L; ti.vals = { pi }; @@ -21,14 +21,13 @@ class SinInt : public flit::TestBase { } protected: - virtual flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { const int zero = (rand() % 10) / 99; const T val = ti.vals[0]; const T score = std::sin(val + zero) / std::sin(val); - const T score2 = score - 1.0; - flit::info_stream << id << ": score = " << score << std::endl; - flit::info_stream << id << ": score2 = " << score2 << std::endl; - return {std::pair(score, score2), 0}; + flit::info_stream << id << ": score = " << score << std::endl; + flit::info_stream << id << ": score - 1.0 = " << score - 1.0 << std::endl; + return score; } protected: diff --git a/litmus-tests/tests/TrianglePHeron.cpp b/litmus-tests/tests/TrianglePHeron.cpp index 61512c90..a0246ec5 100644 --- a/litmus-tests/tests/TrianglePHeron.cpp +++ b/litmus-tests/tests/TrianglePHeron.cpp @@ -24,7 +24,7 @@ T getArea(const T a, template GLOBAL void -TrianglePHKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results) { +TrianglePHKern(const flit::CuTestInput* tiList, double* results) { #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -48,8 +48,7 @@ TrianglePHKern(const flit::CuTestInput* tiList, flit::CudaResultElement* resu auto crit = getCArea(a,b,c); score += std::abs(crit - checkVal); } - results[idx].s1 = score; - results[idx].s2 = 0.0; + results[idx] = score; } template @@ -57,8 +56,8 @@ class TrianglePHeron: public flit::TestBase { public: TrianglePHeron(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.iters = 200; ti.vals = { 6.0 }; @@ -66,10 +65,9 @@ class TrianglePHeron: public 
flit::TestBase { } protected: - virtual - flit::KernelFunction* getKernel() {return TrianglePHKern; } - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override {return TrianglePHKern; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { T maxval = ti.vals[0]; // start as a right triangle T a = maxval; @@ -77,7 +75,6 @@ class TrianglePHeron: public flit::TestBase { T c = maxval * std::sqrt(2); const T delta = maxval / (T)ti.iters; - // 1/2 b*h = A // all perturbations will have the same base and height (plus some FP noise) const T checkVal = 0.5 * b * a; @@ -92,7 +89,7 @@ class TrianglePHeron: public flit::TestBase { auto crit = getArea(a,b,c); score += std::abs(crit - checkVal); } - return {std::pair(score, 0.0), 0}; + return score; } protected: diff --git a/litmus-tests/tests/TrianglePSylv.cpp b/litmus-tests/tests/TrianglePSylv.cpp index 7e8bef04..40fb51a7 100644 --- a/litmus-tests/tests/TrianglePSylv.cpp +++ b/litmus-tests/tests/TrianglePSylv.cpp @@ -22,7 +22,7 @@ T getArea(const T a, template GLOBAL void -TrianglePSKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +TrianglePSKern(const flit::CuTestInput* tiList, double* results){ #ifdef __CUDA__ auto idx = blockIdx.x * blockDim.x + threadIdx.x; #else @@ -46,8 +46,7 @@ TrianglePSKern(const flit::CuTestInput* tiList, flit::CudaResultElement* resu auto crit = getCArea(a,b,c); score += std::abs(crit - checkVal); } - results[idx].s1 = score; - results[idx].s2 = 0.0; + results[idx] = score; } template @@ -55,8 +54,8 @@ class TrianglePSylv: public flit::TestBase { public: TrianglePSylv(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.iters = 200; ti.vals = { 6.0 }; @@ -64,10 +63,9 @@ class TrianglePSylv: public flit::TestBase { } protected: - virtual - flit::KernelFunction* getKernel() {return TrianglePSKern; } - virtual - flit::ResultType::mapped_type run_impl(const flit::TestInput& ti) { + virtual flit::KernelFunction* getKernel() override {return TrianglePSKern; } + + virtual flit::Variant run_impl(const flit::TestInput& ti) override { T maxval = ti.vals[0]; // start as a right triangle T a = maxval; @@ -90,7 +88,7 @@ class TrianglePSylv: public flit::TestBase { auto crit = getArea(a,b,c); score += std::abs(crit - checkVal); } - return {std::pair(score, 0.0), 0}; + return score; } protected: diff --git a/litmus-tests/tests/langois.cpp b/litmus-tests/tests/langois.cpp index eea104df..54bd748c 100644 --- a/litmus-tests/tests/langois.cpp +++ b/litmus-tests/tests/langois.cpp @@ -14,7 +14,7 @@ // template // GLOBAL // void -// addNameHere(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// addNameHere(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else @@ -38,8 +38,7 @@ // auto crit = getCArea(a,b,c); // score += std::abs(crit - checkVal); // } -// results[idx].s1 = score; -// results[idx].s2 = 0.0; +// results[idx] = score; // } //these are the helpers for the langois compensating algos @@ -80,14 +79,13 @@ class langDotFMA: public flit::TestBase { public: langDotFMA(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 0; } - virtual flit::TestInput 
getDefaultInput() { return {}; } + virtual size_t getInputsPerRun() override { return 0; } + virtual flit::TestInput getDefaultInput() override { return {}; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { FLIT_UNUSED(ti); using stype = typename std::vector::size_type; stype size = 16; @@ -101,7 +99,7 @@ class langDotFMA: public flit::TestBase { for(stype i = 1; i < size; ++i){ s[i] = std::fma(x[i], y[i], s[i-1]); } - return {std::pair(s[size-1], (T)0.0), 0}; + return s[size-1]; } protected: @@ -116,14 +114,13 @@ class langCompDotFMA: public flit::TestBase { public: langCompDotFMA(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 0; } - virtual flit::TestInput getDefaultInput() { return {}; } + virtual size_t getInputsPerRun() override { return 0; } + virtual flit::TestInput getDefaultInput() override { return {}; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { FLIT_UNUSED(ti); using stype = typename std::vector::size_type; stype size = 16; @@ -140,7 +137,7 @@ class langCompDotFMA: public flit::TestBase { ThreeFMA(x[i], y[i], s[i-1], s[i], a, B); c[i] = c[i-1] + (a + B); } - return {std::pair(s[size-1] + c[size-1], (T)0.0), 0}; + return s[size-1] + c[size-1]; } protected: @@ -155,14 +152,13 @@ class langCompDot: public flit::TestBase { public: langCompDot(std::string id) : flit::TestBase(std::move(id)) {} - virtual size_t getInputsPerRun() { return 0; } - virtual flit::TestInput getDefaultInput() { return {}; } + virtual size_t getInputsPerRun() override { return 0; } + virtual flit::TestInput getDefaultInput() override { return {}; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { FLIT_UNUSED(ti); using stype = typename std::vector::size_type; stype size = 16; @@ -180,7 +176,7 @@ class langCompDot: public flit::TestBase { TwoSum(p, s[i-1], s[i], si); c[i] = c[i-1] + (pi + si); } - return {std::pair(s[size-1] + c[size-1], (T)0.0), 0}; + return s[size-1] + c[size-1]; } protected: diff --git a/litmus-tests/tests/tinys.cpp b/litmus-tests/tests/tinys.cpp index fa482a3b..86432060 100644 --- a/litmus-tests/tests/tinys.cpp +++ b/litmus-tests/tests/tinys.cpp @@ -10,14 +10,13 @@ // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -25,18 +24,17 @@ class FtoDecToF: public flit::TestBase { public: FtoDecToF(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override 
{ return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = {std::nextafter(T(0.0), T(1.0))}; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { std::numeric_limits nlim; // from https://en.wikipedia.org/wiki/IEEE_floating_point uint16_t ddigs = nlim.digits * std::log10(2) + 1; @@ -46,7 +44,7 @@ class FtoDecToF: public flit::TestBase { dstr = res.str(); T backAgain; std::istringstream(dstr) >> backAgain; - return{std::pair(std::fabs((long double)ti.vals[0] - backAgain), 0.0), 0}; + return ti.vals[0] - backAgain; } using flit::TestBase::id; @@ -57,14 +55,13 @@ REGISTER_TYPE(FtoDecToF) // template // GLOBAL // void -// subnormalKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// subnormalKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -72,20 +69,17 @@ class subnormal: public flit::TestBase { public: subnormal(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = {std::nextafter(T(0.0), T(1.0))}; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { - return { - std::pair(ti.vals[0] - ti.vals[0] / 2, 0.0), 0 - }; + virtual flit::Variant run_impl(const flit::TestInput& ti) override { + return ti.vals[0] - ti.vals[0] / 2; } using flit::TestBase::id; }; @@ -95,14 +89,13 @@ REGISTER_TYPE(subnormal) // template // GLOBAL // void -// dotProdKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// dotProdKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -110,14 +103,13 @@ class dotProd: public flit::TestBase { public: dotProd(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 0; } - virtual flit::TestInput getDefaultInput() { return {}; } + virtual size_t getInputsPerRun() override { return 0; } + virtual flit::TestInput getDefaultInput() override { return {}; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { FLIT_UNUSED(ti); auto size = 16; @@ -127,8 +119,10 @@ class dotProd: public flit::TestBase { rand.begin() + size)); flit::Vector B(std::vector(rand.begin() + size, rand.begin() + 2*size)); - return {std::pair(A ^ B, 0.0), 0}; + return A ^ B; } + +protected: using flit::TestBase::id; }; REGISTER_TYPE(dotProd) @@ -136,14 +130,13 @@ REGISTER_TYPE(dotProd) // 
template // GLOBAL // void -// simpleReductionKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// simpleReductionKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -151,14 +144,13 @@ class simpleReduction: public flit::TestBase { public: simpleReduction(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 0; } - virtual flit::TestInput getDefaultInput() { return {}; } + virtual size_t getInputsPerRun() override { return 0; } + virtual flit::TestInput getDefaultInput() override { return {}; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { FLIT_UNUSED(ti); auto vals = flit::getRandSeq(); auto sublen = vals.size() / 4 - 1; @@ -172,7 +164,7 @@ class simpleReduction: public flit::TestBase { for(uint32_t i = sublen; i < vals.size(); ++i){ sum += vals[i]; } - return {std::pair((long double) sum, 0.0), 0}; + return sum; } using flit::TestBase::id; }; @@ -183,14 +175,13 @@ REGISTER_TYPE(simpleReduction) // template // GLOBAL // void -// addTOLKernel(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// addTOLKernel(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -198,8 +189,8 @@ class addTOL : public flit::TestBase { public: addTOL(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; std::numeric_limits nls; auto man_bits = nls.digits; @@ -229,13 +220,13 @@ class addTOL : public flit::TestBase { } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] + ti.vals[1] + ti.vals[2]; - return {std::pair(res, 0.0), 0}; + return res; } + using flit::TestBase::id; }; @@ -246,14 +237,13 @@ REGISTER_TYPE(addTOL) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -261,22 +251,21 @@ class addSub: public flit::TestBase { public: addSub(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = {T(1.0)}; return ti; } protected: - virtual flit::KernelFunction* 
getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { std::numeric_limits nls; auto man_bits = nls.digits; auto big = std::pow(2, (T)man_bits - 1); auto res = (ti.vals[0] + big) - big; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -285,14 +274,13 @@ REGISTER_TYPE(addSub) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -300,8 +288,8 @@ class divc: public flit::TestBase { public: divc(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 2; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 2; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -311,12 +299,11 @@ class divc: public flit::TestBase { } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] / ti.vals[1]; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -324,14 +311,13 @@ REGISTER_TYPE(divc) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -339,20 +325,19 @@ class zeroMinusX: public flit::TestBase { public: zeroMinusX(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0] }; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = T(0.0) - ti.vals[0]; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -361,14 +346,13 @@ REGISTER_TYPE(zeroMinusX) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -376,20 +360,19 @@ class xMinusZero: public flit::TestBase { public: xMinusZero(std::string id) : flit::TestBase(std::move(id)){} - 
virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0] }; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] - (T)0.0; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -398,14 +381,13 @@ REGISTER_TYPE(xMinusZero) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -413,19 +395,18 @@ class zeroDivX: public flit::TestBase { public: zeroDivX(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0] }; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = (T)0.0 / ti.vals[0]; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -434,14 +415,13 @@ REGISTER_TYPE(zeroDivX) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -449,19 +429,18 @@ class xDivOne: public flit::TestBase { public: xDivOne(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0] }; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] / (T)1.0; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -470,14 +449,13 @@ REGISTER_TYPE(xDivOne) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = 
; +// results[idx] = 0.0; // } template @@ -485,19 +463,18 @@ class xDivNegOne: public flit::TestBase { public: xDivNegOne(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0] }; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] / (T)-1.0; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -506,14 +483,13 @@ REGISTER_TYPE(xDivNegOne) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -521,8 +497,8 @@ class negAdivB: public flit::TestBase { public: negAdivB(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 2; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 2; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -531,12 +507,11 @@ class negAdivB: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = -(ti.vals[0] / ti.vals[1]); - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -545,14 +520,13 @@ REGISTER_TYPE(negAdivB) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } // template @@ -569,11 +543,10 @@ REGISTER_TYPE(negAdivB) // protected: // virtual flit::KernelFunction* getKernel() { return nullptr; } // -// virtual flit::ResultType::mapped_type -// run_impl(const flit::TestInput& ti) { +// virtual flit::Variant run_impl(const flit::TestInput& ti) { // //yes, this is ugly. 
ti.vals s/b vector of floats // auto res = (T)((std::result_of<::get_next_type(T)>::type)ti.vals[0]); -// return {res, 0.0}; +// return res; // } // using flit::TestBase::id; // }; @@ -583,14 +556,13 @@ REGISTER_TYPE(negAdivB) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -598,8 +570,8 @@ class negAminB: public flit::TestBase { public: negAminB(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 2; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 2; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -608,12 +580,11 @@ class negAminB: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = -(ti.vals[0] - ti.vals[1]); - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -623,14 +594,13 @@ REGISTER_TYPE(negAminB) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -638,20 +608,19 @@ class xMinusX: public flit::TestBase { public: xMinusX(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 1; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 1; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0] }; return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] - ti.vals[0]; - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -661,14 +630,13 @@ REGISTER_TYPE(xMinusX) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -676,8 +644,8 @@ class negAplusB: public flit::TestBase { public: negAplusB(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 2; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 2; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ 
-686,12 +654,11 @@ class negAplusB: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = -(ti.vals[0] + ti.vals[1]); - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -701,14 +668,13 @@ REGISTER_TYPE(negAplusB) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -716,8 +682,8 @@ class aXbDivC: public flit::TestBase { public: aXbDivC(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -727,12 +693,11 @@ class aXbDivC: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] * (ti.vals[1] / ti.vals[2]); - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -742,14 +707,13 @@ REGISTER_TYPE(aXbDivC) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -757,8 +721,8 @@ class aXbXc: public flit::TestBase { public: aXbXc(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -768,12 +732,11 @@ class aXbXc: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] * (ti.vals[1] * ti.vals[2]); - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -783,14 +746,13 @@ REGISTER_TYPE(aXbXc) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -798,8 
+760,8 @@ class aPbPc: public flit::TestBase { public: aPbPc(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput() { + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -809,12 +771,11 @@ class aPbPc: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { + virtual flit::Variant run_impl(const flit::TestInput& ti) override { auto res = ti.vals[0] + (ti.vals[1] + ti.vals[2]); - return {std::pair(res, 0.0), 0}; + return res; } using flit::TestBase::id; }; @@ -824,14 +785,13 @@ REGISTER_TYPE(aPbPc) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -839,8 +799,8 @@ class xPc1EqC2: public flit::TestBase { public: xPc1EqC2(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -850,12 +810,11 @@ class xPc1EqC2: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { - auto res = ti.vals[0] + ti.vals[1] == ti.vals[2]; - return {std::pair(res?1.0:0.0, 0.0), 0}; + virtual flit::Variant run_impl(const flit::TestInput& ti) override { + bool res = ti.vals[0] + ti.vals[1] == ti.vals[2]; + return res ? 
1.0 : 0.0; } using flit::TestBase::id; }; @@ -865,14 +824,13 @@ REGISTER_TYPE(xPc1EqC2) // template // GLOBAL // void -// FtoDecToFKern(const flit::CuTestInput* tiList, flit::CudaResultElement* results){ +// FtoDecToFKern(const flit::CuTestInput* tiList, double* results){ // #ifdef __CUDA__ // auto idx = blockIdx.x * blockDim.x + threadIdx.x; // #else // auto idx = 0; // #endif -// results[idx].s1 = ; -// results[idx].s2 = ; +// results[idx] = 0.0; // } template @@ -880,8 +838,8 @@ class xPc1NeqC2: public flit::TestBase { public: xPc1NeqC2(std::string id) : flit::TestBase(std::move(id)){} - virtual size_t getInputsPerRun() { return 3; } - virtual flit::TestInput getDefaultInput(){ + virtual size_t getInputsPerRun() override { return 3; } + virtual flit::TestInput getDefaultInput() override { flit::TestInput ti; ti.vals = { flit::getRandSeq()[0], @@ -891,12 +849,11 @@ class xPc1NeqC2: public flit::TestBase { return ti; } protected: - virtual flit::KernelFunction* getKernel() { return nullptr; } + virtual flit::KernelFunction* getKernel() override { return nullptr; } - virtual flit::ResultType::mapped_type - run_impl(const flit::TestInput& ti) { - auto res = ti.vals[0] + ti.vals[1] != ti.vals[2]; - return {std::pair(res?1.0:0.0, 0.0), 0}; + virtual flit::Variant run_impl(const flit::TestInput& ti) override { + bool res = ti.vals[0] + ti.vals[1] != ti.vals[2]; + return res ? 1.0 : 0.0; } using flit::TestBase::id; }; diff --git a/scripts/flitcli/config/flit-default.toml.in b/scripts/flitcli/config/flit-default.toml.in index f933061a..ed5b5b4d 100644 --- a/scripts/flitcli/config/flit-default.toml.in +++ b/scripts/flitcli/config/flit-default.toml.in @@ -1,9 +1,12 @@ [database] -username = 'mbentley' -address = 'localhost' -type = 'postgres' -port = 5432 +# older versions of flit supported postgres. that has been removed. only +# sqlite is supported at the moment. +type = 'sqlite' + +# if relative path, it is relative to the directory containing this +# configuration file. +filepath = 'results.sqlite' [[hosts]] @@ -13,15 +16,23 @@ name = '{hostname}' flit_path = '{flit_path}' config_dir = '{config_dir}' +# The settings for "make dev" +[hosts.dev_build] +# compiler_name must be found in [[hosts.compilers]] list under name attribute +# but the optimization level and switches do not need to be in the compiler list +compiler_name = 'g++' +optimization_level = '-O2' +switches = '-funsafe-math-optimizations' + # The ground truth compilation to use in analysis [hosts.ground_truth] # compiler needs to match the name field for one of this host's hosts.compilers # The optimization level needs to be part of the list of optimization_levels # and the switch needs to be a string of compiler flags from the switches list. # TODO: make use of hosts.ground_truth -compiler = 'g++' +compiler_name = 'g++' optimization_level = '-O0' -switch = '' +switches = '' [[hosts.compilers]] @@ -48,7 +59,7 @@ switch = '' #'-Ofast', #'-O...' ? 
] - switches = [ + switches_list = [ '', '-fassociative-math', '-mavx', diff --git a/scripts/flitcli/flit_import.py b/scripts/flitcli/flit_import.py new file mode 100644 index 00000000..09f2613d --- /dev/null +++ b/scripts/flitcli/flit_import.py @@ -0,0 +1,159 @@ +'Implements the import subcommand, importing results into a database' + +import flitutil as util + +import toml + +import argparse +import csv +import datetime +import os +import sqlite3 +import sys + +brief_description = 'Import flit results into the configured database' + +def _file_check(filename): + if not os.path.isfile(filename): + raise argparse.ArgumentTypeError('File does not exist: {0}'.format(filename)) + return filename + +def main(arguments, prog=sys.argv[0]): + parser = argparse.ArgumentParser( + prog=prog, + description=''' + Import flit results into the configured database. The + configured database is found from the settings in + flit-config.toml. You can import either exported results or + results from manually running the tests. Note that importing + the same thing twice will result in having two copies of it + in the database. + ''', + ) + parser.add_argument('importfile', nargs='+', type=_file_check, + help=''' + File(s) to import into the database. These files + may be csv files or sqlite3 databases. + ''') + #parser.add_argument('-t', '--table', default='tests', + # help=''' + # The database table used for import (default is tests) + # ''') + parser.add_argument('-r', '--run', type=int, default=-1, + help=''' + The run number to import under. If the run does + not exist in the runs table, then a new entry will + be created with an autogenerated message. The + default behavior is to use the latest run in the + database (but one will be created if there is no + runs). If importing from an sqlite database, the + run column of the tests table is ignored since + there is no necessary correlation between run + numbers of that database and this database. So you + would still want to use the --run option. For the + sqlite database case, the results imported will + only be from the latest run in the importing + database. + ''') + parser.add_argument('--new-run', action='store_true', + help=''' + Specifies that this import should be under a new + run number that will be autogenerated. This option + conflicts with the --run option, meaning if this + argument is specified, then the --run argument will + be ignored. This option is also implied if the + destination database has no runs in it. + ''') + args = parser.parse_args(arguments) + + try: + projconf = toml.load('flit-config.toml') + except FileNotFoundError: + print('Error: {0} not found. 
Run "flit init"'.format(tomlfile), + file=sys.stderr) + return 1 + + assert projconf['database']['type'] == 'sqlite', \ + 'Only sqlite database supported' + db = util.sqlite_open(projconf['database']['filepath']) + + # if the database has no runs, then turn on --new-run + run_ids = [x['id'] for x in db.execute('select id from runs')] + if len(run_ids) == 0: + args.new_run = True + print('run_ids: ', run_ids) + + # Find the destination run + if not args.new_run: + assert args.run <= 0 or args.run in run_ids, \ + 'Specified run {0} is not in the runs table'.format(args.run) + if args.run not in run_ids: + args.run = sorted(run_ids)[-1] + else: # args.new_run + # Create a new run to use in import + db.execute('insert into runs(rdate,notes) values (?,?)', + (datetime.datetime.now(), 'Imported using flit import')) + db.commit() + args.run = db.execute('select id from runs order by id').fetchall()[-1]['id'] + + for importee in args.importfile: + print(importee) + if util.is_sqlite(importee): + # Try to treat the importfile like a sqlite database + import_db = util.sqlite_open(importee) + cur = import_db.cursor() + cur.execute('select id from runs') + run_ids = sorted([x['id'] for x in cur]) + if len(run_ids) == 0: + print(' nothing to import') + continue + latest_run = run_ids[-1] + cur.execute('select name,host,compiler,optl,switches,precision,' + 'comparison,comparison_d,file,nanosec ' + 'from tests where run = ?', (latest_run,)) + rows = cur.fetchall() + else: + with open(importee, 'r') as csvin: + reader = csv.DictReader(csvin) + rows = [row for row in reader] + if len(rows) == 0: + print(' nothing to import') + continue + to_insert = [] + for row in rows: + # Convert 'NULL' to None + for key, val in row.items(): + row[key] = val if val != 'NULL' else None + # Insert + to_insert.append(( + args.run, + row['name'], + row['host'], + row['compiler'], + row['optl'], + row['switches'], + row['precision'], + row['comparison'], + row['comparison_d'], + row['file'], + row['nanosec'], + )) + db.executemany(''' + insert into tests( + run, + name, + host, + compiler, + optl, + switches, + precision, + comparison, + comparison_d, + file, + nanosec) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', to_insert) + db.commit() + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) diff --git a/scripts/flitcli/flit_init.py b/scripts/flitcli/flit_init.py index 48f390fd..d4549524 100644 --- a/scripts/flitcli/flit_init.py +++ b/scripts/flitcli/flit_init.py @@ -57,7 +57,7 @@ def main(arguments, prog=sys.argv[0]): # Add litmus tests too if args.litmus_tests: for srcfile in os.listdir(conf.litmus_test_dir): - if os.path.splitext(srcfile)[1] in ('.cpp', '.hpp', '.h'): + if os.path.splitext(srcfile)[1] in ('.cpp', '.h'): srcpath = os.path.join(conf.litmus_test_dir, srcfile) to_copy[os.path.join('tests', srcfile)] = srcpath diff --git a/scripts/flitcli/flit_run.py b/scripts/flitcli/flit_run.py index 4ec2d5dd..9f4a8d9a 100644 --- a/scripts/flitcli/flit_run.py +++ b/scripts/flitcli/flit_run.py @@ -1,4 +1,4 @@ -'Implements the squelch subcommand' +'Implements the run subcommand for executing all compilations of flit tests' import argparse import sys @@ -15,8 +15,8 @@ def main(arguments, prog=sys.argv[0]): should be sent to the database computer for later analysis. 
''', ) - parser.add_argument('directory', default='.', - help='The directory to initialize') + parser.add_argument('description', + help='A description of the test run (required)') args = parser.parse_args(arguments) # Subcommand logic here diff --git a/scripts/flitcli/flit_update.py b/scripts/flitcli/flit_update.py index d0ed8f0d..53e324b0 100644 --- a/scripts/flitcli/flit_update.py +++ b/scripts/flitcli/flit_update.py @@ -39,15 +39,46 @@ def main(arguments, prog=sys.argv[0]): else: print('Creating {0}'.format(makefile)) - compiler = projconf['hosts'][0]['compilers'][0]['binary'] - if '/' in compiler: - compiler = os.path.realpath(compiler) + host = projconf['hosts'][0] + dev_build = host['dev_build'] + dev_compiler_name = dev_build['compiler_name'] + dev_optl = dev_build['optimization_level'] + dev_switches = dev_build['switches'] + matching_dev_compilers = [x for x in host['compilers'] + if x['name'] == dev_compiler_name] + assert len(matching_dev_compilers) > 0, \ + 'Compiler name {0} not found'.format(dev_compiler_name) + assert len(matching_dev_compilers) < 2, \ + 'Multiple compilers with name {0} found'.format(dev_compiler_name) + dev_compiler_bin = matching_dev_compilers[0]['binary'] + if '/' in dev_compiler_bin: + dev_compiler_bin = os.path.realpath(dev_compiler_bin) + + ground_truth = host['ground_truth'] + gt_compiler_name = ground_truth['compiler_name'] + gt_optl = ground_truth['optimization_level'] + gt_switches = ground_truth['switches'] + matching_gt_compilers = [x for x in host['compilers'] + if x['name'] == gt_compiler_name] + assert len(matching_gt_compilers) > 0, \ + 'Compiler name {0} not found'.format(gt_compiler_name) + assert len(matching_gt_compilers) < 2, \ + 'Multiple compilers with name {0} found'.format(gt_compiler_name) + # TODO: use the compiler mnemonic rather than the path + gt_compiler_bin = matching_gt_compilers[0]['binary'] + if '/' in gt_compiler_bin: + gt_compiler_bin = os.path.realpath(gt_compiler_bin) flitutil.process_in_file( os.path.join(conf.data_dir, 'Makefile.in'), makefile, { - 'compiler': compiler, + 'dev_compiler': dev_compiler_bin, + 'dev_optl': dev_optl, + 'dev_switches': dev_switches, + 'ground_truth_compiler': gt_compiler_bin, + 'ground_truth_optl': gt_optl, + 'ground_truth_switches': gt_switches, 'flit_include_dir': conf.include_dir, 'flit_lib_dir': conf.lib_dir, 'flit_script': os.path.join(conf.script_dir, 'flit.py'), diff --git a/scripts/flitcli/flitconfig.py b/scripts/flitcli/flitconfig.py index 4106c1e4..7f8aa6ea 100644 --- a/scripts/flitcli/flitconfig.py +++ b/scripts/flitcli/flitconfig.py @@ -21,20 +21,20 @@ script_dir = os.path.dirname(os.path.realpath(__file__)) # flit documentation -doc_dir = os.path.realpath(os.path.join(script_dir, '../../documentation')) +doc_dir = os.path.realpath(os.path.join(script_dir, '..', '..', 'documentation')) # compiled libflit.so -lib_dir = os.path.realpath(os.path.join(script_dir, '../../lib')) +lib_dir = os.path.realpath(os.path.join(script_dir, '..', '..', 'lib')) # flit C++ include files, primarily flit.h -include_dir = os.path.realpath(os.path.join(script_dir, '../../src')) +include_dir = os.path.realpath(os.path.join(script_dir, '..', '..', 'src')) # default configuration for flit init config_dir = os.path.join(script_dir, 'config') # default data files such as Makefiles -data_dir = os.path.join(script_dir, 'data') +data_dir = os.path.join(script_dir, '..', '..', 'data') # directory containing litmus tests -litmus_test_dir = os.path.realpath(os.path.join(script_dir,
'../../litmus-tests/tests')) +litmus_test_dir = os.path.realpath(os.path.join(script_dir, '..', '..', 'litmus-tests', 'tests')) diff --git a/scripts/flitcli/flitutil.py b/scripts/flitcli/flitutil.py index 5e008219..32ac972d 100644 --- a/scripts/flitcli/flitutil.py +++ b/scripts/flitcli/flitutil.py @@ -2,7 +2,10 @@ Utility functions shared between multiple flit subcommands. ''' +import flitconfig as conf + import os +import sqlite3 import sys def process_in_file(infile, dest, vals, overwrite=False): @@ -25,4 +28,38 @@ def process_in_file(infile, dest, vals, overwrite=False): with open(dest, 'w') as fout: fout.write(fin.read().format(**vals)) +def sqlite_open(filepath): + ''' + Opens and returns an sqlite database cursor object. If the database does + not exist, it will be created. + ''' + # Using detect_types allows us to insert datetime objects + connection = sqlite3.connect(filepath, + detect_types=sqlite3.PARSE_DECLTYPES) + + # Use the dict factory so that queries return dictionary-like objects + connection.row_factory = sqlite3.Row + + # Create the tables if they do not exist. Also has other setup. + table_file = os.path.join(conf.data_dir, 'db', 'tables-sqlite.sql') + with open(table_file, 'r') as table_sql: + connection.executescript(table_sql.read()) + connection.commit() + + return connection + +def is_sqlite(filename): + 'Returns true if the file is likely an sqlite file.' + from os.path import isfile, getsize + + if not os.path.isfile(filename): + return False + + # SQLite database file header is 100 bytes + if os.path.getsize(filename) < 100: + return False + + with open(filename, 'rb') as fd: + header = fd.read(100) + return header[:16] == b'SQLite format 3\000' diff --git a/src/CUHelpers.cpp b/src/CUHelpers.cpp index b5c3d674..1b02de5b 100644 --- a/src/CUHelpers.cpp +++ b/src/CUHelpers.cpp @@ -1,5 +1,5 @@ -#include "CUHelpers.hpp" -#include "flitHelpers.hpp" +#include "CUHelpers.h" +#include "flitHelpers.h" namespace flit { diff --git a/src/CUHelpers.hpp b/src/CUHelpers.h similarity index 99% rename from src/CUHelpers.hpp rename to src/CUHelpers.h index 2e837977..febd37c5 100644 --- a/src/CUHelpers.hpp +++ b/src/CUHelpers.h @@ -14,8 +14,8 @@ #define DEVICE __device__ #define GLOBAL __global__ #endif -#include "flitHelpers.hpp" -#include "CUVector.hpp" +#include "flitHelpers.h" +#include "CUVector.h" #include diff --git a/src/CUVector.hpp b/src/CUVector.h similarity index 99% rename from src/CUVector.hpp rename to src/CUVector.h index e817d208..6dd99c7a 100644 --- a/src/CUVector.hpp +++ b/src/CUVector.h @@ -1,7 +1,7 @@ #ifndef CU_VECTOR_HPP #define CU_VECTOR_HPP -#include "CUHelpers.hpp" +#include "CUHelpers.h" #include diff --git a/src/InfoStream.cpp b/src/InfoStream.cpp index 8fbc4bc4..20eecf50 100644 --- a/src/InfoStream.cpp +++ b/src/InfoStream.cpp @@ -1,4 +1,4 @@ -#include "InfoStream.hpp" +#include "InfoStream.h" #include diff --git a/src/InfoStream.hpp b/src/InfoStream.h similarity index 100% rename from src/InfoStream.hpp rename to src/InfoStream.h diff --git a/src/TestBase.cpp b/src/TestBase.cpp index f2c6ca4c..9650b120 100644 --- a/src/TestBase.cpp +++ b/src/TestBase.cpp @@ -1,35 +1,21 @@ //this is the base instantiation for tests -#include "TestBase.hpp" +#include "TestBase.h" #include -// namespace { -// double volatile baseD; -// bool dReg = false; -// float volatile baseF; -// bool fReg = false; -// long double volatile baseL; -// bool lReg = false; -// bool watching = false; -// std::stack fStack; -// std::stack dStack; -// std::stack lStack; -// } // end of 
unnamed namespace namespace flit { - //output operator for ResultType -std::ostream& -operator<<(std::ostream& os, const ResultType& res){ - // std::string name = r.first; - // std::string prec; - // long double s1; - // long double s2; - // std::tie(prec, s1, s2) = r.second; - for(auto r : res){ - os << r.first.first << ":" << r.first.second << "," - << r.second.first.first << "," << r.second.first.second << r.second.second << std::endl; - } +std::ostream& operator<<(std::ostream& os, const TestResult& res) { + std::string comparison = + (res.is_comparison_null() ? std::to_string(res.comparison()) : "NULL"); + + os << res.name() << ":" << res.precision() << "," + << res.result() << "," + << comparison << "," + << res.nanosecs(); + return os; } + } // end of namespace flit diff --git a/src/TestBase.hpp b/src/TestBase.h similarity index 62% rename from src/TestBase.hpp rename to src/TestBase.h index 609bbfff..329ce070 100644 --- a/src/TestBase.hpp +++ b/src/TestBase.h @@ -5,30 +5,70 @@ #ifndef TEST_BASE_HPP #define TEST_BASE_HPP -#include "flitHelpers.hpp" +#include "flitHelpers.h" #ifdef __CUDA__ -#include "CUHelpers.hpp" +#include "CUHelpers.h" #endif // __CUDA__ +#include "Variant.h" + +#include #include #include +#include #include #include #include #include + #include namespace flit { void setWatching(bool watch = true); +struct TestResult { +public: + TestResult(const std::string &_name, const std::string &_precision, + const Variant &_result, int_fast64_t _nanosecs, + const std::string &_resultfile = "") + : m_name(_name) + , m_precision(_precision) + , m_result(_result) + , m_nanosecs(_nanosecs) + , m_resultfile(_resultfile) + { } + + // getters + std::string name() const { return m_name; } + std::string precision() const { return m_precision; } + Variant result() const { return m_result; } + int_fast64_t nanosecs() const { return m_nanosecs; } + long double comparison() const { return m_comparison; } + bool is_comparison_null() const { return m_is_comparison_null; } + std::string resultfile() const { return m_resultfile; } + + // setters + void set_comparison(long double _comparison) { + m_comparison = _comparison; + m_is_comparison_null = false; + } + void set_resultfile(const std::string &_resultfile) { + m_resultfile = _resultfile; + } -using ResultType = std::map, - std::pair, int_fast64_t>>; +private: + std::string m_name; + std::string m_precision; + Variant m_result; + int_fast64_t m_nanosecs {0}; + long double m_comparison {0.0L}; + bool m_is_comparison_null {true}; + std::string m_resultfile; +}; -std::ostream& -operator<<(std::ostream&, const ResultType&); +std::ostream& operator<<(std::ostream& os, const TestResult& res); template struct TestInput { @@ -40,11 +80,6 @@ struct TestInput { std::vector vals; }; -struct CudaResultElement { - double s1; - double s2; -}; - /** A simple structure used in CUDA tests. 
* * It stores some values and a pointer, but does not do dynamic allocation nor @@ -91,7 +126,7 @@ struct CuTestInput { * @param results: array where to store results, already allocated */ template -using KernelFunction = void (const CuTestInput*, CudaResultElement*); +using KernelFunction = void (const CuTestInput*, double*); template using CudaDeleter = void (T*); @@ -140,7 +175,7 @@ std::unique_ptr*> makeCudaArr(const T* vals, size_t length) { * @param stride: how many inputs per test run */ template -std::vector +std::vector runKernel(KernelFunction* kernel, const TestInput& ti, size_t stride) { #ifdef __CUDA__ size_t runCount; @@ -157,7 +192,7 @@ runKernel(KernelFunction* kernel, const TestInput& ti, size_t stride) { ctiList[i].vals = ti.vals.data() + i * stride; ctiList[i].length = stride; } - std::unique_ptr cuResults(new CudaResultElement[runCount]); + std::unique_ptr cuResults(new double[runCount]); // Note: __CPUKERNEL__ mode is broken by the change to run the kernel in // multithreaded mode. Its compilation is broken. // TODO: fix __CPUKERNEL__ mode for testing. @@ -170,25 +205,21 @@ runKernel(KernelFunction* kernel, const TestInput& ti, size_t stride) { ctiList[i].vals = deviceVals.get() + i * stride; } auto deviceInput = makeCudaArr(ctiList.get(), runCount); - auto deviceResult = makeCudaArr(nullptr, runCount); + auto deviceResult = makeCudaArr(nullptr, runCount); kernel<<>>(deviceInput.get(), deviceResult.get()); - auto resultSize = sizeof(CudaResultElement) * runCount; + auto resultSize = sizeof(double) * runCount; checkCudaErrors(cudaMemcpy(cuResults.get(), deviceResult.get(), resultSize, cudaMemcpyDeviceToHost)); # endif // __CPUKERNEL__ - std::vector results; - for (size_t i = 0; i < runCount; i++) { - results.emplace_back(std::pair - (cuResults[i].s1, cuResults[i].s2), 0); - } + std::vector results(cuResults, cuResults + runCount); return results; -#else // not __CUDA__ +#else // not __CUDA__ // Do nothing FLIT_UNUSED(kernel); FLIT_UNUSED(ti); FLIT_UNUSED(stride); return {}; -#endif // __CUDA__ +#endif // __CUDA__ } template @@ -208,13 +239,14 @@ class TestBase { * * @see getInputsPerRun */ - virtual ResultType run(const TestInput& ti, - const bool GetTime, - const size_t TimingLoops) { + virtual std::vector run(const TestInput& ti, + const std::string &filebase, + const bool GetTime, + const size_t TimingLoops) { using std::chrono::high_resolution_clock; using std::chrono::duration; using std::chrono::duration_cast; - ResultType results; + std::vector results; TestInput emptyInput { ti.iters, ti.highestDim, ti.ulp_inc, ti.min, ti.max, {} }; @@ -237,78 +269,145 @@ class TestBase { } // Run the tests - std::vector scoreList; + struct TimedResult { + Variant result; + int_fast64_t time; + std::string resultfile; + + TimedResult(Variant res, int_fast64_t t, const std::string &f = "") + : result(res), time(t), resultfile(f) { } + }; + std::vector resultValues; #ifdef __CUDA__ auto kernel = getKernel(); if (kernel == nullptr) { for (auto runInput : inputSequence) { + Variant testResult; + int_fast64_t timing = 0; if (GetTime) { - ResultType::mapped_type scores; int_fast64_t nsecs = 0; for (int r = 0; r < TimingLoops; ++r) { auto s = high_resolution_clock::now(); - scores = run_impl(runInput); + testResult = run_impl(runInput); auto e = high_resolution_clock::now(); nsecs += duration_cast>(e-s).count(); assert(nsecs > 0); } - scores.second = nsecs / TimingLoops; - scoreList.push_back(scores); + timing = nsecs / TimingLoops; } else { - scoreList.push_back(run_impl(runInput)); + 
testResult = run_impl(runInput); + timing = 0; } + // Output string results to file since it alone may take up to 300 MB + // or more + std::string outfile; + if (testResult.type() == Variant::Type::String) { + outfile = filebase + "_" + id + "_" + typeid(T).name() + ".dat"; + std::ofstream resultout(outfile); + resultout << testResult.string(); + testResult = Variant(); // empty the result to release memory + } + resultValues.emplace_back(testResult, timing, outfile); } } else { + int_fast64_t timing = 0; + std::vector scoreList; if (GetTime) { - ResultType::mapped_type scores; int_fast64_t nsecs = 0; for (size_t r = 0; r < TimingLoops; ++r){ auto s = high_resolution_clock::now(); + // TODO: find out how to properly profile CUDA kernels. + // FIXME: This strategy of timing is not right because: + // FIXME: 1. multiple inputs are tested in parallel + // FIXME: 2. timing is done not only over kernel execution, but also + // FIXME: in transfer time + // FIXME: 3. stalls in device availability are not accounted for scoreList = runKernel(kernel, ti, stride); auto e = high_resolution_clock::now(); nsecs += duration_cast>(e-s).count(); assert(nsecs > 0); } auto avg = nsecs / TimingLoops; - auto avgPerKernel = avg / scoreList.size(); - for (auto& s : scoreList) { - s.second = avgPerKernel; - } + timing = avg / scoreList.size(); } else { scoreList = runKernel(kernel, ti, stride); + timing = 0; + } + for (auto& testResult : scoreList) { + resultValues.emplace_back(testResult, timing); } } #else // not __CUDA__ for (auto runInput : inputSequence) { + Variant testResult; + int_fast64_t timing = 0; if (GetTime) { - ResultType::mapped_type scores; int_fast64_t nsecs = 0; for (size_t r = 0; r < TimingLoops; ++r) { auto s = high_resolution_clock::now(); - scores = run_impl(runInput); + testResult = run_impl(runInput); auto e = high_resolution_clock::now(); nsecs += duration_cast>(e-s).count(); assert(nsecs > 0); } - scores.second = nsecs / TimingLoops; - scoreList.push_back(scores); + timing = nsecs / TimingLoops; } else { - scoreList.push_back(run_impl(runInput)); + testResult = run_impl(runInput); + timing = 0; + } + // Output string results to file since it alone may take up to 300 MB + // or more + std::string outfile; + if (testResult.type() == Variant::Type::String) { + outfile = filebase + "_" + id + "_" + typeid(T).name() + ".dat"; + std::ofstream resultout(outfile); + resultout << testResult.string(); + testResult = Variant(); // empty the result to release memory } + resultValues.emplace_back(testResult, timing, outfile); } #endif // __CUDA__ // Store and return the test results - for (size_t i = 0; i < scoreList.size(); i++) { + for (size_t i = 0; i < resultValues.size(); i++) { std::string name = id; - if (scoreList.size() != 1) { + if (resultValues.size() != 1) { name += "_idx" + std::to_string(i); } - results.insert({{name, typeid(T).name()}, scoreList[i]}); + results.emplace_back(name, typeid(T).name(), resultValues[i].result, + resultValues[i].time, resultValues[i].resultfile); } return results; } + /** Simply forwards the request to the appropriate overload of compare. + * + * If the types of the variants do not match, then a std::runtime_error is + * thrown. 
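As a rough illustration of that contract (editor's sketch, not code from this patch; test is assumed to be a pointer to a concrete TestBase<double> instance and the values are made up):

flit::Variant gt(1.0L);
flit::Variant tr(1.5L);
long double err = test->variant_compare(gt, tr);  // dispatches to compare(long double, long double)

flit::Variant s("serialized output");
// test->variant_compare(gt, s);                  // type mismatch: throws std::runtime_error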
+ */ + long double variant_compare(const Variant &ground_truth, + const Variant &test_results) { + if (ground_truth.type() != test_results.type()) { + throw std::runtime_error("Variants to compare are of different types"); + } + long double val = 0.0; + switch (ground_truth.type()) { + case Variant::Type::LongDouble: + val = this->compare(ground_truth.longDouble(), + test_results.longDouble()); + break; + + case Variant::Type::String: + val = this->compare(ground_truth.string(), + test_results.string()); + break; + + default: + throw std::runtime_error("Unimplemented Variant type"); + } + return val; + } + /** This is a set of default inputs to use for the test * * This function should be implemented such that we can simply call this test @@ -330,6 +429,38 @@ class TestBase { */ virtual size_t getInputsPerRun() = 0; + /** Custom comparison methods + * + * These comparison operations are meant to create a metric between the test + * results from this test in the current compilation, and the results from + * the ground truth compilation. You can do things like the relative error + * or the absolute error (for the case of long double). + * + * The below specified functions are the default implementations defined in + * the base class. It is safe to delete these two functions if this + * implementation is adequate for you. + * + * Which one is used depends on the type of Variant that is returned from the + * run_impl function. The value returned by compare will be the value stored + * in the database for later analysis. + * + * Note: when using the CUDA kernel functionality, only long double return + * values are valid for now. + */ + virtual long double compare(long double ground_truth, + long double test_results) const { + // absolute error + return test_results - ground_truth; + } + + /** There is no good default implementation comparing two strings */ + virtual long double compare(const std::string &ground_truth, + const std::string &test_results) const { + FLIT_UNUSED(ground_truth); + FLIT_UNUSED(test_results); + return 0.0; + } + protected: /** If this test implements a CUDA kernel, return the kernel pointer * @@ -351,27 +482,29 @@ class TestBase { * test inputs required according to the implemented getInputsPerRun(). So * if that function returns 9, then the vector will have exactly 9 * elements. - * @return a single result. See ResultType to see what the mapped types is. + * @return a single result. You can return any type supported by flit::Variant. + * + * The returned value (whichever type is chosen) will be used by the public + * virtual compare() method. 
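For example, a test that prefers relative error over the default absolute error could override the long double overload of compare(); a sketch, assuming it sits inside a user's test class derived from flit::TestBase<T>:

virtual long double compare(long double ground_truth,
                            long double test_results) const override {
  if (ground_truth == 0.0L) {
    return test_results - ground_truth;                 // fall back to absolute error at zero
  }
  return (test_results - ground_truth) / ground_truth;  // relative error
}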
*/ - virtual ResultType::mapped_type run_impl(const TestInput& ti) = 0; + virtual Variant run_impl(const TestInput& ti) = 0; protected: const std::string id; }; -/// A completely empty test that outputs nothing +/** A completely empty test that outputs nothing */ template class NullTest : public TestBase { public: NullTest(std::string id) : TestBase(std::move(id)) {} - virtual TestInput getDefaultInput() { return {}; } - virtual size_t getInputsPerRun() { return 0; } - virtual ResultType run(const TestInput&, - const bool, - const size_t) { return {}; } + virtual TestInput getDefaultInput() override { return {}; } + virtual size_t getInputsPerRun() override { return 0; } + virtual std::vector run( + const TestInput&, const bool, const size_t) override { return {}; } protected: - virtual KernelFunction* getKernel() { return nullptr; } - virtual ResultType::mapped_type run_impl(const TestInput&) { return {}; } + virtual KernelFunction* getKernel() override { return nullptr; } + virtual Variant run_impl(const TestInput&) override { return {}; } }; class TestFactory { @@ -420,13 +553,13 @@ inline std::shared_ptr> TestFactory::get () { #ifdef __CUDA__ #define REGISTER_TYPE(klass) \ - class klass##Factory : public flit::TestFactory { \ + class klass##Factory : public flit::TestFactory { \ public: \ klass##Factory() { \ - flit::registerTest(#klass, this); \ + flit::registerTest(#klass, this); \ } \ protected: \ - virtual createType create() { \ + virtual createType create() override { \ return std::make_tuple( \ std::make_shared>(#klass), \ std::make_shared>(#klass), \ @@ -440,13 +573,13 @@ inline std::shared_ptr> TestFactory::get () { #else // not __CUDA__ #define REGISTER_TYPE(klass) \ - class klass##Factory : public flit::TestFactory { \ + class klass##Factory : public flit::TestFactory { \ public: \ klass##Factory() { \ - flit::registerTest(#klass, this); \ + flit::registerTest(#klass, this); \ } \ protected: \ - virtual createType create() { \ + virtual createType create() override { \ return std::make_tuple( \ std::make_shared>(#klass), \ std::make_shared>(#klass), \ @@ -463,16 +596,6 @@ inline std::map& getTests() { return tests; } -// template ::type* = nullptr> -// static std::map& getTests() { -// #ifdef __CUDA__ -// return {}; -// #else -// static std::map tests; -// return tests; -// #endif -// } - inline void registerTest(const std::string& name, TestFactory *factory) { getTests()[name] = factory; } diff --git a/src/Variant.cpp b/src/Variant.cpp new file mode 100644 index 00000000..7e5660af --- /dev/null +++ b/src/Variant.cpp @@ -0,0 +1,32 @@ +#include "Variant.h" + +namespace flit { + +std::ostream& operator<< (std::ostream& out, const Variant &val) { + switch (val.type()) { + case Variant::Type::None: + out << "Variant(None)"; + break; + case Variant::Type::LongDouble: + out << "Variant(" << val.longDouble() << ")"; + break; + case Variant::Type::String: + out << "Variant(\"" << val.string() << "\")"; + break; + default: + throw std::runtime_error("Unimplemented type"); + } + return out; +} + +template <> +long double Variant::val() const { + return this->longDouble(); +} + +template <> +std::string Variant::val() const { + return this->string(); +} + +} // end of namespace flit diff --git a/src/Variant.h b/src/Variant.h new file mode 100644 index 00000000..d4270cbb --- /dev/null +++ b/src/Variant.h @@ -0,0 +1,71 @@ +#ifndef VARIANT_H +#define VARIANT_H + +#include +#include +#include + +namespace flit { + +/** Can represent various different types + * + * This class is intented to 
be able to hold many different types in the + * same object so that you can do things like make a list containing + * sometimes strings and sometimes integers, etc. + */ +class Variant { +public: + enum class Type { + None = 1, + LongDouble = 2, + String = 3, + }; + + Variant() : _type(Type::None) { } + + Variant(long double val) + : _type(Type::LongDouble) + , _ld_val(val) { } + + Variant(std::string &val) + : _type(Type::String) + , _str_val(val) { } + Variant(const std::string &val) + : _type(Type::String) + , _str_val(val) { } + Variant(std::string &&val) + : _type(Type::String) + , _str_val(val) { } + Variant(const char* val) + : _type(Type::String) + , _str_val(val) { } + + Type type() const { return _type; } + + long double longDouble() const { + if (_type != Type::LongDouble) { + throw std::runtime_error("Variant is not of type Long Double"); + } + return _ld_val; + } + + std::string string() const { + if (_type != Type::String) { + throw std::runtime_error("Variant is not of type String"); + } + return _str_val; + } + + template T val() const; + +private: + Type _type; + long double _ld_val { 0.0l }; + std::string _str_val { "" }; +}; + +std::ostream& operator<< (std::ostream&, const Variant&); + +} // end of namespace flit + +#endif // VARIANT_H diff --git a/src/flit.cpp b/src/flit.cpp index 8d3a1c7f..c223d04c 100644 --- a/src/flit.cpp +++ b/src/flit.cpp @@ -8,30 +8,86 @@ #include #include +#include #include #include "flit.h" -#include "flitHelpers.hpp" -#include "TestBase.hpp" - -void outputResults(const flit::ResultType& scores, std::ostream& out){ - using flit::operator<<; - using flit::as_int; - for(const auto& i: scores){ - out - << "HOST,SWITCHES,OPTL,COMPILER," - << i.first.second << ",us," // sort - << i.second.first.first << "," // score0d - << as_int(i.second.first.first) << "," // score0 - << i.second.first.second << "," // score1d - << as_int(i.second.first.second) << "," // score1 - << i.first.first << "," // name - << i.second.second << "," // nanoseconds - << "FILENAME" // filename - << std::endl; +#include "flitHelpers.h" +#include "TestBase.h" + +namespace { + +/** Helper class for Csv. + * + * Represents a single row either indexed by number or by column name. 
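A small usage sketch of these helpers (illustrative only: Csv and CsvRow live in an unnamed namespace inside flit.cpp, the file name is made up, and the usual standard headers are assumed; the column names match those read by parseResults below):

std::ifstream in("results.csv");             // hypothetical results file with a header row
Csv csv(in);
CsvRow row;
while (csv >> row) {
  std::string name = row["name"];            // lookup by header column name
  long nanosec = std::stol(row["nanosec"]);  // numeric columns are parsed by the caller
  // positional access row[0], row[1], ... still works through std::vector
}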
+ */
+class CsvRow : public std::vector<std::string> {
+public:
+  const CsvRow* header() const { return m_header; }
+  void setHeader(CsvRow* head) { m_header = head; }
+
+  using std::vector<std::string>::operator[];
+  std::string const& operator[](std::string col) const {
+    auto iter = std::find(m_header->begin(), m_header->end(), col);
+    if (iter == m_header->end()) {
+      std::stringstream message;
+      message << "No column named " << col;
+      throw std::invalid_argument(message.str());
+    }
+    auto idx = iter - m_header->begin();
+    return this->operator[](idx);
   }
-}
+
+private:
+  CsvRow* m_header {nullptr};  // not owned by this class
+};
+
+/** Class for parsing csv files */
+class Csv {
+public:
+  Csv(std::istream &in) : m_header(Csv::parseRow(in)), m_in(in) {
+    m_header.setHeader(&m_header);
+  }
+
+  Csv& operator>> (CsvRow& row) {
+    row = Csv::parseRow(m_in);
+    row.setHeader(&m_header);
+    return *this;
+  }
+
+  operator bool() const { return static_cast<bool>(m_in); }
+
+private:
+  static CsvRow parseRow(std::istream &in) {
+    std::string line;
+    std::getline(in, line);
+
+    std::stringstream lineStream(line);
+    std::string token;
+
+    // tokenize on ','
+    CsvRow row;
+    while(std::getline(lineStream, token, ',')) {
+      row.emplace_back(token);
+    }
+
+    // check for trailing comma with no data after it
+    if (!lineStream && token.empty()) {
+      row.emplace_back("");
+    }
+
+    return row;
+  }
+
+private:
+  CsvRow m_header;
+  std::istream &m_in;
+};
+
+} // end of unnamed namespace
+
+namespace flit {
 
 std::string FlitOptions::toString() {
   std::ostringstream messanger;
@@ -42,6 +98,8 @@ std::string FlitOptions::toString() {
     << " verbose: " << boolToString(this->verbose) << "\n"
     << " timing: " << boolToString(this->timing) << "\n"
     << " timingLoops: " << this->timingLoops << "\n"
+    << " output: " << this->output << "\n"
+    << " groundTruth: " << this->groundTruth << "\n"
     << " precision: " << this->precision << "\n"
     << " tests:\n";
   for (auto& test : this->tests) {
@@ -53,17 +111,18 @@ FlitOptions parseArguments(int argCount, char* argList[]) {
   FlitOptions options;
-  std::vector<std::string> helpOpts = { "-h", "--help" };
-  std::vector<std::string> verboseOpts = { "-v", "--verbose" };
-  std::vector<std::string> timingOpts = { "-t", "--timing" };
-  std::vector<std::string> loopsOpts = { "-l", "--timing-loops" };
-  std::vector<std::string> listTestsOpts = { "-L", "--list-tests" };
-  std::vector<std::string> precisionOpts = { "-p", "--precision" };
-  std::vector<std::string> outputOpts = { "-o", "--output" };
+  std::vector<std::string> helpOpts = { "-h", "--help" };
+  std::vector<std::string> verboseOpts = { "-v", "--verbose" };
+  std::vector<std::string> timingOpts = { "-t", "--timing" };
+  std::vector<std::string> loopsOpts = { "-l", "--timing-loops" };
+  std::vector<std::string> listTestsOpts = { "-L", "--list-tests" };
+  std::vector<std::string> precisionOpts = { "-p", "--precision" };
+  std::vector<std::string> outputOpts = { "-o", "--output" };
+  std::vector<std::string> groundTruthOpts = { "-g", "--ground-truth" };
   std::vector<std::string> allowedPrecisions = { "all", "float", "double", "long double" };
-  auto allowedTests = getKeys(flit::getTests());
+  auto allowedTests = getKeys(getTests());
   allowedTests.emplace_back("all");
   for (int i = 1; i < argCount; i++) {
     std::string current(argList[i]);
@@ -99,6 +158,11 @@ FlitOptions parseArguments(int argCount, char* argList[]) {
         throw ParseException(current + " requires an argument");
       }
       options.output = argList[++i];
+    } else if (isIn(groundTruthOpts, current)) {
+      if (i+1 == argCount) {
+        throw ParseException(current + " requires an argument");
+      }
+      options.groundTruth = argList[++i];
     } else {
       options.tests.push_back(current);
       if (!isIn(allowedTests, current)) {
@@ -108,7 +172,7 @@ FlitOptions parseArguments(int argCount, char* argList[]) {
   }
 
   if (options.tests.size() == 0 || isIn(options.tests, std::string("all"))) {
-    options.tests = getKeys(flit::getTests());
+    options.tests = getKeys(getTests());
   }
 
   return options;
@@ -152,6 +216,18 @@ std::string usage(std::string progName) {
     " standard output will still go to the terminal.\n"
     " The default behavior is to output to stdout.\n"
     "\n"
+    " -g INFILE, --ground-truth INFILE\n"
+    "     Use the following results file (usually generated\n"
+    "     using the --output option with the ground-truth\n"
+    "     compiled executable). This option allows the\n"
+    "     creation of data for the comparison column in the\n"
+    "     results. The test's compare() function is used.\n"
+    "\n"
+    "     Note: for tests outputting string data, the path\n"
+    "     may be a relative path from where you executed the\n"
+    "     ground-truth executable, in which case you will\n"
+    "     want to run this test from that same directory.\n"
+    "\n"
     " -p PRECISION, --precision PRECISION\n"
     "     Which precision to run. The choices are 'float',\n"
     "     'double', 'long double', and 'all'. The default\n"
@@ -160,3 +236,48 @@
   return messanger.str();
 }
 
+std::string readFile(const std::string &filename) {
+  std::ifstream filein(filename);
+  std::stringstream buffer;
+  buffer << filein.rdbuf();
+  return buffer.str();
+}
+
+std::vector<TestResult> parseResults(std::istream &in) {
+  std::vector<TestResult> results;
+
+  Csv csv(in);
+  CsvRow row;
+  while (csv >> row) {
+    auto nanosec = std::stol(row["nanosec"]);
+    Variant value;
+    std::string resultfile;
+    if (row["score"] != "NULL") {
+      // Convert score into a long double
+      value = as_float(flit::stouint128(row["score"]));
+    } else {
+      // Read string from the resultfile
+      assert(row["resultfile"] != "NULL");
+      resultfile = row["resultfile"];
+    }
+
+    results.emplace_back(row["name"], row["precision"], value, nanosec,
+                         resultfile);
+  }
+
+  return results;
+}
+
+std::string removeIdxFromName(const std::string &name) {
+  std::string pattern("_idx");  // followed by 1 or more digits
+  auto it = std::find_end(name.begin(), name.end(),
+                          pattern.begin(), pattern.end());
+  // assert that after the pattern, all the remaining chars are digits.
+  assert(it == name.end() ||
+         std::all_of(it + pattern.size(), name.end(), [](char c) {
+           return '0' <= c && c <= '9';
+         }));
+  return std::string(name.begin(), it);
+}
+
+} // end of namespace flit
diff --git a/src/flit.h b/src/flit.h
index 32061712..09aedf98 100644
--- a/src/flit.h
+++ b/src/flit.h
@@ -4,12 +4,12 @@
 #ifndef FLIT_H
 #define FLIT_H 0
 
-#include "flitHelpers.hpp"
-#include "TestBase.hpp"
+#include "flitHelpers.h"
+#include "TestBase.h"
 
 #ifdef __CUDA__
 //#include
-#include "CUHelpers.hpp"
+#include "CUHelpers.h"
 #endif
 
 #include
@@ -23,19 +23,34 @@
 #include
 
-void outputResults(const flit::ResultType& scores, std::ostream& out);
+// Define macros to use in the output
+// These can be overridden at compile time to insert compile-time information
 
-template <typename F>
-void runTestWithDefaultInput(flit::TestFactory* factory,
-                             flit::ResultType& totScores,
-                             bool shouldTime = true,
-                             int timingLoops = 1) {
-  auto test = factory->get<F>();
-  auto ip = test->getDefaultInput();
-  auto scores = test->run(ip, shouldTime, timingLoops);
-  totScores.insert(scores.begin(), scores.end());
-  flit::info_stream.flushout();
-}
+#ifndef FLIT_HOST
+#define FLIT_HOST "HOST"
+#endif // FLIT_HOST
+
+#ifndef FLIT_COMPILER
+#define FLIT_COMPILER "COMPILER"
+#endif // FLIT_COMPILER
+
+#ifndef FLIT_OPTL
+#define FLIT_OPTL "OPTL"
+#endif // FLIT_OPTL
+
+#ifndef FLIT_SWITCHES
+#define FLIT_SWITCHES "SWITCHES"
+#endif // FLIT_SWITCHES
+
+#ifndef FLIT_NULL
+#define FLIT_NULL "NULL"
+#endif // FLIT_NULL
+
+#ifndef FLIT_FILENAME
+#define FLIT_FILENAME "FILENAME"
+#endif // FLIT_FILENAME
+
+namespace flit {
 
 /** Command-line options */
 struct FlitOptions {
@@ -47,6 +62,7 @@ struct FlitOptions {
   std::string output = ""; // output file for results. default stdout
   bool timing = true; // should we run timing?
   int timingLoops = 1; // < 1 means to auto-determine the timing loops
+  std::string groundTruth = ""; // input for ground-truth comparison
 
   /** Give a string representation of this struct for printing purposes */
   std::string toString();
@@ -57,6 +73,128 @@ struct FlitOptions {
   }
 };
 
+/** Parse arguments */
+FlitOptions parseArguments(int argCount, char* argList[]);
+
+/** Returns the usage information as a string */
+std::string usage(std::string progName);
+
+/** Read file contents entirely into a string */
+std::string readFile(const std::string &filename);
+
+/** Parse the results file into a vector of results */
+std::vector<TestResult> parseResults(std::istream &in);
+
+/** Test names sometimes are postfixed with "_idx" + <number>.  Remove that postfix */
+std::string removeIdxFromName(const std::string &name);
+
+inline void outputResults (const std::vector<TestResult>& results,
+                           std::ostream& out)
+{
+  // Output the column headers
+  out << "name,"
+         "host,"
+         "compiler,"
+         "optl,"
+         "switches,"
+         "precision,"
+         "score,"
+         "score_d,"
+         "resultfile,"
+         "comparison,"
+         "comparison_d,"
+         "file,"
+         "nanosec"
+      << std::endl;
+  for(const auto& result: results){
+    out
+      << result.name() << ","       // test case name
+      << FLIT_HOST << ","           // hostname
+      << FLIT_COMPILER << ","       // compiler
+      << FLIT_OPTL << ","           // optimization level
+      << FLIT_SWITCHES << ","       // compiler flags
+      << result.precision() << ","  // precision
+      ;
+
+    if (result.result().type() == Variant::Type::LongDouble) {
+      out
+        << as_int(result.result().longDouble()) << ","  // score
+        << result.result().longDouble() << ","          // score_d
+        ;
+    } else {
+      out
+        << FLIT_NULL << ","  // score
+        << FLIT_NULL << ","  // score_d
+        ;
+    }
+
+    if (result.resultfile().empty()) {
+      out << FLIT_NULL << ",";            // resultfile
+    } else {
+      out << result.resultfile() << ",";  // resultfile
+    }
+
+    if (result.is_comparison_null()) {
+      out
+        << FLIT_NULL << ","  // comparison
+        << FLIT_NULL << ","  // comparison_d
+        ;
+    } else {
+      out
+        << as_int(result.comparison()) << ","  // comparison
+        << result.comparison() << ","          // comparison_d
+        ;
+    }
+
+    out
+      << FLIT_FILENAME << ","  // executable filename
+      << result.nanosecs()     // nanoseconds
+      << std::endl;
+  }
+}
+
+
+template <typename F>
+void runTestWithDefaultInput(TestFactory* factory,
+                             std::vector<TestResult>& totResults,
+                             const std::string &filebase = "",
+                             bool shouldTime = true,
+                             int timingLoops = 1) {
+  auto test = factory->get<F>();
+  auto ip = test->getDefaultInput();
+  auto results = test->run(ip, filebase, shouldTime, timingLoops);
+  totResults.insert(totResults.end(), results.begin(), results.end());
+  info_stream.flushout();
+}
+
+template <typename F>
+long double runComparison_impl(TestFactory* factory, const TestResult &gt,
+                               const TestResult &res) {
+  auto test = factory->get<F>();
+  if (!res.resultfile().empty()) {
+    assert(res.result().type() == Variant::Type::None);
+    assert( gt.result().type() == Variant::Type::None);
+    return test->compare(readFile(gt.resultfile()),
+                         readFile(res.resultfile()));
+  } else if (res.result().type() == Variant::Type::LongDouble) {
+    return test->compare(gt.result().longDouble(), res.result().longDouble());
+  } else { throw std::runtime_error("Unsupported variant type"); }
+}
+
+inline long double runComparison(TestFactory* factory, const TestResult &gt,
+                                 const TestResult &res) {
+  // TODO: after moving to lazy file load, load file contents at comparison
+  if (res.precision() == "f") {
+    return runComparison_impl<float>(factory, gt, res);
+  } else if (res.precision() == "d") {
+    return runComparison_impl<double>(factory, gt, res);
+  } else if (res.precision() == "e") {
+    return runComparison_impl<long double>(factory, gt, res);
+  } else { throw std::runtime_error("Unrecognized precision encountered"); }
+}
+
+
+
 /** Returns true if the element is in the container */
 template <typename Container, typename Element>
 bool isIn(Container c, Element e) {
@@ -81,12 +219,6 @@ class ParseException : std::exception {
   const std::string _message;
 };
 
-/** Parse arguments */
-FlitOptions parseArguments(int argCount, char* argList[]);
-
-/** Returns the usage information as a string */
-std::string usage(std::string progName);
-
 inline int runFlitTests(int argc, char* argv[]) {
   // Argument parsing
   FlitOptions options;
@@ -104,53 +236,97 @@ inline int runFlitTests(int argc, char* argv[]) {
   }
 
   if (options.listTests) {
-    for (auto& test : getKeys(flit::getTests())) {
+    for (auto& test : getKeys(getTests())) {
       std::cout << test << std::endl;
     }
     return 0;
   }
 
   if (options.verbose) {
-    flit::info_stream.show();
+    info_stream.show();
   }
 
   std::unique_ptr<std::ostream> stream_deleter;
   std::ostream *outstream = &std::cout;
+  std::string test_result_filebase(FLIT_FILENAME);
   if (!options.output.empty()) {
     stream_deleter.reset(new std::ofstream(options.output.c_str()));
     outstream = stream_deleter.get();
+    test_result_filebase = options.output;
   }
   std::cout.precision(1000); //set cout to print many decimal places
-  flit::info_stream.precision(1000);
+  info_stream.precision(1000);
 
 #ifdef __CUDA__
-  flit::initDeviceData();
+  initDeviceData();
 #endif
 
-  flit::ResultType scores;
-  auto testMap = flit::getTests();
+  std::vector<TestResult> results;
+  std::vector<TestResult> groundTruthResults;
+  if (!options.groundTruth.empty()) {
+    std::ifstream gtfile(options.groundTruth);
+    // TODO: only load file contents at time of comparison
+    groundTruthResults = parseResults(gtfile);
+  }
+
+  auto testMap = getTests();
   for (auto& testName : options.tests) {
     auto factory = testMap[testName];
     if (options.precision == "all" || options.precision == "float") {
-      runTestWithDefaultInput<float>(factory, scores, options.timing,
-                                     options.timingLoops);
+      runTestWithDefaultInput<float>(factory, results, test_result_filebase,
+                                     options.timing, options.timingLoops);
     }
     if (options.precision == "all" || options.precision == "double") {
-      runTestWithDefaultInput<double>(factory, scores, options.timing,
-                                      options.timingLoops);
+      runTestWithDefaultInput<double>(factory, results, test_result_filebase,
+                                      options.timing, options.timingLoops);
     }
     if (options.precision == "all" || options.precision == "long double") {
-      runTestWithDefaultInput<long double>(factory, scores, options.timing,
-                                           options.timingLoops);
+      runTestWithDefaultInput<long double>(
+          factory, results, test_result_filebase, options.timing,
+          options.timingLoops);
     }
+    // TODO: dump string result to file because we might run out of memory
   }
 #if defined(__CUDA__) && !defined(__CPUKERNEL__)
   cudaDeviceSynchronize();
 #endif
 
-  outputResults(scores, *outstream);
+  // Sort the results first by name then by precision
+  auto testComparator = [](const TestResult &a, const TestResult &b) {
+    if (a.name() != b.name()) {
+      return a.name() < b.name();
+    } else {
+      return a.precision() < b.precision();
+    }
+  };
+  std::sort(results.begin(), results.end(), testComparator);
+  std::sort(groundTruthResults.begin(), groundTruthResults.end(),
+            testComparator);
+
+  // Let's now run the ground-truth comparisons
+  if (groundTruthResults.size() > 0) {
+    for (auto& res : results) {
+      auto factory = testMap[removeIdxFromName(res.name())];
+      // Use binary search to find the first associated ground truth element
+      auto gtIter = std::lower_bound(groundTruthResults.begin(),
                                     groundTruthResults.end(), res,
+                                     testComparator);
+      // Compare the two results if the element was found
+      if (gtIter != groundTruthResults.end() &&
+          res.name() == (*gtIter).name() &&
+          res.precision() == (*gtIter).precision())
+      {
+        res.set_comparison(runComparison(factory, *gtIter, res));
+      }
+    }
+  }
+
+  // Create the main results output
+  outputResults(results, *outstream);
 
   return 0;
 }
 
+} // end of namespace flit
+
 #endif // FLIT_H
diff --git a/src/flitHelpers.cpp b/src/flitHelpers.cpp
index 3e37b7f5..f61d13cb 100644
--- a/src/flitHelpers.cpp
+++ b/src/flitHelpers.cpp
@@ -2,11 +2,13 @@
 // they utilize the watch data for sensitive points
 // of computation.
-#include "flitHelpers.hpp" +#include "flitHelpers.h" #include #include +#include + namespace flit { const std::vector @@ -73,29 +75,52 @@ const std::vector long_rands = setRandSeq(RAND_VECT_SI thread_local InfoStream info_stream; -std::mutex ostreamMutex; std::ostream& operator<<(std::ostream& os, const unsigned __int128 i){ - if(i == 0) os << 0; - else{ - std::ostringstream ost; - uint64_t hi = i >> 64; - uint64_t lo = (uint64_t)i; - ostreamMutex.lock(); - auto bflags = os.flags(); - os.flags(std::ios::hex & ~std::ios::showbase); - ost.flags(std::ios::hex & ~std::ios::showbase); - ost << lo; - os << "0x" << hi; - for(uint32_t x = 0; x < 16 - ost.str().length(); ++x){ - os << "0"; - } - os << ost.str(); - os.flags( bflags ); - ostreamMutex.unlock(); + std::ostringstream ost; + uint64_t hi = i >> 64; + uint64_t lo = (uint64_t)i; + auto bflags = os.flags(); + os.flags(std::ios::hex & ~std::ios::showbase); + ost.flags(std::ios::hex & ~std::ios::showbase); + ost << lo; + os << "0x" << hi; + for(uint32_t x = 0; x < 16 - ost.str().length(); ++x){ + os << "0"; } + os << ost.str(); + os.flags( bflags ); return os; } +unsigned __int128 stouint128(const std::string &str) { + uint64_t hi, lo; + // TODO: make this more efficient (maybe). + std::string copy; + if (str[0] == '0' && str[1] == 'x') { + copy = std::string(str.begin() + 2, str.end()); + } else { + copy = str; + } + + // Convert each section of 8-bytes (16 characters) + assert(copy.size() <= 32); + if (copy.size() <= 16) { + hi = 0; + lo = std::stoull(copy, nullptr, 16); + } else { + auto mid = copy.end() - 16; + hi = std::stoull(std::string(copy.begin(), mid), nullptr, 16); + lo = std::stoull(std::string(mid, copy.end()), nullptr, 16); + } + + // Combine the two 64-bit values. + unsigned __int128 val; + val = hi; + val = val << 64; + val += lo; + return val; +} + } // end of namespace flit diff --git a/src/flitHelpers.hpp b/src/flitHelpers.h similarity index 99% rename from src/flitHelpers.hpp rename to src/flitHelpers.h index 387dbaad..6502164d 100644 --- a/src/flitHelpers.hpp +++ b/src/flitHelpers.h @@ -5,8 +5,8 @@ #ifndef FLIT_HELPERS_HPP #define FLIT_HELPERS_HPP -#include "InfoStream.hpp" -#include "CUHelpers.hpp" +#include "InfoStream.h" +#include "CUHelpers.h" #include #include @@ -89,6 +89,7 @@ std::vector const & getRandSeq(); std::ostream& operator<<(std::ostream&, const unsigned __int128); +unsigned __int128 stouint128(const std::string &str); HOST_DEVICE inline float