-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Benchmark] Refactor benchmark system.
Instead of using a different script for each experiment and DBMS to benchmark, implement `connectors` to these DBMS's. The connector has a method to execute an experiment with the given parameters and returns the measured times. In addition, the format of the YAML files of the experiments has been refactored to contain all the information and parameters to execute them on each connector. `Benchmark.py` is refactored as well to read the experiment files and execute them on each available specified connector, with possibly multiple configurations. Some more minor changes: - Benchmark script now has the option to execute one (or multiple) specific experiments. - The `run_id` of each experiment run is tracked and inserted into the database.
- Loading branch information
Tobias Kopp
committed
Apr 21, 2023
1 parent
1d6cd0b
commit ba6948f
Showing
32 changed files
with
3,559 additions
and
1,977 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
class Connector(ABC):
    """Abstract base class for DBMS benchmark connectors.

    A connector knows how to run one experiment against a particular DBMS
    and report the measured execution times per configuration and case.
    """

    @abstractmethod
    def execute(self, n_runs: int, params: dict):
        """Perform the experiment described by `params` `n_runs` times.

        Returns a dict of the measured times, structured as::

            results
            └── configurations
                └── cases
                    └── times (list)

        i.e. results maps configuration name -> configuration, a
        configuration maps case -> times, and times is a list of floats
        of length ``n_runs``.

        Example (n_runs=2)::

            {
                'PostgreSQL': {
                    1: [1235.093, 1143.43],
                    2: [1033.711, 1337.37],
                    3: [1043.452, 1010.01],
                    4: [1108.702, 1234.56],
                }
            }
        """
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
from .connector import * | ||
|
||
import duckdb | ||
import os | ||
import json | ||
|
||
|
||
# Scratch database file; created per run and removed by `DuckDB.clean_up`.
TMP_DB = 'tmp.duckdb'
# Scratch file collecting the SQL statements piped into the DuckDB CLI.
TMP_SQL_FILE = 'tmp.sql'

# TODO way of measuring time is wrong. Use duckdb_cli like in older version.
|
||
class DuckDB(Connector): | ||
|
||
def __new__(cls, *args, **kwargs): | ||
return super().__new__(cls) | ||
|
||
|
||
def __init__(self, duckdb_cli, verbose=False): | ||
self.duckdb_cli=duckdb_cli | ||
|
||
|
||
# Runs an experiment 'n_runs' times, all parameters are in 'params' | ||
def execute(self, n_runs, params: dict): | ||
self.clean_up() | ||
|
||
measurement_times = dict() # map that is returned with the measured times | ||
|
||
# Check wether tables contain scale factors | ||
with_scale_factors = False | ||
for table in params['data'].values(): | ||
if (table.get('scale_factors')): | ||
with_scale_factors = True | ||
break | ||
|
||
for _ in range(n_runs): | ||
try: | ||
# Set up database | ||
self.generate_create_table_stmts(params['data'], with_scale_factors) | ||
|
||
|
||
# If tables contain scale factors, they have to be loaded separately for every case | ||
if (with_scale_factors and bool(params.get('readonly'))): | ||
# Write cases/queries to a file that will be passed to the command to execute | ||
statements = list() | ||
for case, query_stmt in params['cases'].items(): | ||
# Create tables from tmp tables with scale factor | ||
for table_name, table in params['data'].items(): | ||
statements.append(f"DELETE FROM {table_name};") # empty existing table | ||
if table.get('scale_factors'): | ||
sf = table['scale_factors'][case] | ||
else: | ||
sf = 1 | ||
header = int(table.get('header', 0)) | ||
num_rows = round((table['lines_in_file'] - header) * sf) | ||
statements.append(f"INSERT INTO {table_name} SELECT * FROM {table_name}_tmp LIMIT {num_rows};") | ||
|
||
statements.append(".timer on") | ||
statements.append(query_stmt) # Actual query from this case | ||
statements.append(".timer off") | ||
|
||
# Append statements to file | ||
with open(TMP_SQL_FILE, "a+") as tmp: | ||
for stmt in statements: | ||
tmp.write(stmt + "\n") | ||
|
||
|
||
|
||
# Otherwise, tables have to be created just once before the measurements (done above) | ||
else: | ||
# Write cases/queries to a file that will be passed to the command to execute | ||
with open(TMP_SQL_FILE, "a+") as tmp: | ||
tmp.write(".timer on\n") | ||
for case_query in params['cases'].values(): | ||
tmp.write(case_query + '\n') | ||
tmp.write(".timer off\n") | ||
|
||
|
||
# Execute query file and collect measurement data | ||
command = f"./{self.duckdb_cli} {TMP_DB} < {TMP_SQL_FILE}" + " | grep 'Run Time' | cut -d ' ' -f 5 | awk '{print $1 * 1000;}'" | ||
stream = os.popen(f'{command}') | ||
for idx, line in enumerate(stream): | ||
time = float(line.replace("\n", "").replace(",", ".")) # in milliseconds | ||
case = list(params['cases'].keys())[idx] | ||
if case not in measurement_times.keys(): | ||
measurement_times[case] = list() | ||
measurement_times[case].append(time) | ||
stream.close() | ||
|
||
|
||
finally: | ||
self.clean_up() | ||
|
||
return {'DuckDB': measurement_times} | ||
|
||
|
||
# Deletes the used temporary database | ||
def clean_up(self): | ||
if os.path.exists(TMP_DB): | ||
os.remove(TMP_DB) | ||
if os.path.exists(TMP_SQL_FILE): | ||
os.remove(TMP_SQL_FILE) | ||
|
||
|
||
# Parse attributes of one table, return as string | ||
def parse_attributes(self, attributes: dict): | ||
columns = '(' | ||
for column_name, ty in attributes.items(): | ||
not_null = 'NOT NULL' if 'NOT NULL' in ty else '' | ||
ty = ty.split(' ') | ||
match (ty[0]): | ||
case 'INT': | ||
typ = 'INT' | ||
case 'CHAR': | ||
typ = f'CHAR({ty[1]})' | ||
case 'DECIMAL': | ||
typ = f'DECIMAL({ty[1]},{ty[2]})' | ||
case 'DATE': | ||
typ = 'DATE' | ||
case 'DOUBLE': | ||
typ = 'DOUBLE' | ||
case 'FLOAT': | ||
typ = 'REAL' | ||
case 'BIGINT': | ||
typ = 'BIGINT' | ||
case _: | ||
raise Exception(f"Unknown type given for '{column_name}'") | ||
columns += f"{column_name} {typ} {not_null}, " | ||
columns = columns[:-2] + ')' | ||
return columns | ||
|
||
|
||
# Creates tables in the database and copies contents of given files into them | ||
# Call with 'with_scale_factors'=False if data should be loaded as a whole | ||
# Call with 'with_scale_factors'=True if data should be placed in tmp tables | ||
# and copied for each case with different scale factor | ||
def generate_create_table_stmts(self, data: dict, with_scale_factors): | ||
statements = list() | ||
for table_name, table in data.items(): | ||
columns = self.parse_attributes(table['attributes']) | ||
|
||
delimiter = table.get('delimiter') | ||
header = table.get('header') | ||
format = table['format'].upper() | ||
|
||
if with_scale_factors: | ||
table_name += "_tmp" | ||
|
||
create = f"CREATE TABLE {table_name} {columns};" | ||
copy = f"COPY {table_name} FROM '{table['file']}' ( " | ||
if delimiter: | ||
delim = delimiter.replace("'", "") | ||
copy += f" DELIMITER \'{delim}\'," | ||
if format: | ||
copy += f" FORMAT {format}," | ||
if header: | ||
copy += f" HEADER," if (header==1) else "" | ||
|
||
copy = copy[:-1] + " );" | ||
|
||
statements.append(create) | ||
statements.append(copy) | ||
|
||
if with_scale_factors: | ||
# Create actual table that will be used for experiment | ||
statements.append(f"CREATE TABLE {table_name[:-4]} {columns};") | ||
|
||
with open(TMP_SQL_FILE, "w") as tmp: | ||
for stmt in statements: | ||
tmp.write(stmt + "\n") |
Oops, something went wrong.