[Benchmark] Refactor benchmark system.
Instead of using a different script for each experiment and DBMS to
benchmark, implement `connectors` for these DBMSs. Each connector has a
method that executes an experiment with the given parameters and returns
the measured times.

In addition, the format of the experiments' YAML files has been
refactored to contain all the information and parameters needed to
execute them on each connector.

`Benchmark.py` is refactored as well: it reads the experiment files and
executes them on each specified connector that is available, possibly
with multiple configurations.
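
A rough sketch of the resulting driver loop (the helper name, file
handling, and exact wiring are assumptions, not taken from the commit):

import yaml

from database_connectors.duckdb import DuckDB

# Hypothetical driver: load one experiment file and run it on every
# specified connector; each connector returns {system: {case: [times]}}.
def run_experiment(path, connectors, n_runs=5):
    with open(path) as f:
        experiment = yaml.safe_load(f)
    results = dict()
    for connector in connectors:
        results.update(connector.execute(n_runs, experiment))
    return results

results = run_experiment('benchmark/example.yml',
                         [DuckDB(duckdb_cli='duckdb_cli')], n_runs=2)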

Some more minor changes:
- The benchmark script now has the option to execute one (or multiple)
  specific experiments.
- The `run_id` of each experiment run is tracked and inserted into the
  database.
Tobias Kopp committed Apr 21, 2023
1 parent 1d6cd0b commit ba6948f
Showing 32 changed files with 3,559 additions and 1,977 deletions.
404 changes: 106 additions & 298 deletions benchmark/Benchmark.py

Large diffs are not rendered by default.

57 changes: 35 additions & 22 deletions benchmark/_schema.yml
@@ -1,28 +1,19 @@
description: str()
version: int(required=False, min=1)
suite: str()
benchmark: str()
name: str(required=False)
readonly: bool()
pattern: str()
args: str(required=False)
configurations: map(str(), required=False)
tables: include('table_list')
cases: map(any(str(), include('case')), key=any())
compare_to: map(required=False)
chart: include('chart', required=False)
chart: include('chart_def', required=False)
data: map(include('table'), required=False) # Map from table name to 'table'
systems:
  mutable: include('mutable', required=False)
  PostgreSQL: include('PostgreSQL', required=False)
  DuckDB: include('DuckDB', required=False)
  HyPer: include('HyPer', required=False)
---
table:
  name: str() # table name
  path: str(required=False) # path to table file
  sf: num(required=False, min=0, max=1) # scale factor as portion of file to load; defaults to 1
  delimiter: str(required=False) # defaults to ','
  header: int(required=False) # 1 if file has header, 0 otherwise; defaults to 0
table_list:
  list(any(str(), include('table')), required=True)
case:
  query: str()
  tables: include('table_list')
chart_def:
  x: include('axis', required=False)
  y: include('axis', required=False)
---
axis:
  # Kind of scale, one of
@@ -37,6 +28,28 @@ axis:
  type: str(required=False)
  # A label for the axis
  label: str(required=False)
chart:
  x: include('axis', required=False)
  y: include('axis', required=False)
---
table:
  attributes: map(str(), key=str(), required=False) # table column names and types
  file: str(required=False) # path to table file
  delimiter: str(required=False) # defaults to ','
  header: int() # 1 if file has header, 0 otherwise; defaults to 0
  format: str(required=False) # file format
  scale_factors: map(num(min=0, max=1), required=False) # map from case name to scale factor (portion of file to load)
  lines_in_file: int(required=False) # number of lines in the file; counted and added by the benchmark script
---
mutable:
  cases: include('cases')
  pattern: str()
  args: str(required=False)
  configurations: map(str(), required=False)
PostgreSQL:
  cases: include('cases')
DuckDB:
  cases: include('cases')
HyPer:
  single_core: bool(required=False)
  all_cores: bool(required=False)
  cases: include('cases')
---
cases: map(str(), key=any())
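
For illustration, a hypothetical experiment file that matches this schema
(suite, table, file, and queries are invented):

description: illustrative selection benchmark
version: 1
suite: example
benchmark: selection
readonly: true
data:
  Numbers:
    attributes:
      id: 'INT'
      val: 'FLOAT'
    file: 'benchmark/data/numbers.csv'
    delimiter: ','
    header: 1
    format: 'csv'
systems:
  DuckDB:
    cases:
      1: 'SELECT id FROM Numbers WHERE val < 0.5;'
  PostgreSQL:
    cases:
      1: 'SELECT id FROM Numbers WHERE val < 0.5;'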
27 changes: 27 additions & 0 deletions benchmark/database_connectors/connector.py
@@ -0,0 +1,27 @@
from abc import ABC, abstractmethod

class Connector(ABC):

    # Performs an experiment `n_runs` times with the given parameters `params`.
    # Returns a dict with the measured times for the experiment and configuration.
    # The result has the form:
    #     results
    #     └── configurations
    #         └── cases
    #             └── times (list)
    #
    # results:       configuration name --> configuration
    # configuration: case --> times
    # times:         list of floats (size = n_runs)
    #
    # Example (n_runs = 2):
    # {
    #     'PostgreSQL': {
    #         1: [1235.093, 1143.43],
    #         2: [1033.711, 1337.37],
    #         3: [1043.452, 1010.01],
    #         4: [1108.702, 1234.56]
    #     }
    # }
    @abstractmethod
    def execute(self, n_runs: int, params: dict):
        pass
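
A usage sketch for a concrete connector (the CLI path and all parameter
values are invented for illustration):

from database_connectors.duckdb import DuckDB

params = {
    'readonly': True,
    'data': {
        'Numbers': {                      # hypothetical table
            'attributes': {'id': 'INT', 'val': 'FLOAT'},
            'file': 'benchmark/data/numbers.csv',
            'delimiter': ',',
            'header': 1,
            'format': 'csv',
        },
    },
    'cases': {1: 'SELECT id FROM Numbers WHERE val < 0.5;'},
}

connector = DuckDB(duckdb_cli='duckdb_cli')   # path to the DuckDB CLI binary (assumed)
results = connector.execute(2, params)
# e.g. {'DuckDB': {1: [1235.093, 1143.43]}}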
170 changes: 170 additions & 0 deletions benchmark/database_connectors/duckdb.py
@@ -0,0 +1,170 @@
from .connector import *

import duckdb
import os
import json


TMP_DB = 'tmp.duckdb'
TMP_SQL_FILE = 'tmp.sql'

# TODO: This way of measuring time is wrong. Use the DuckDB CLI as in the older version.

class DuckDB(Connector):

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls)


    def __init__(self, duckdb_cli, verbose=False):
        self.duckdb_cli = duckdb_cli
        self.verbose = verbose


    # Runs an experiment `n_runs` times; all parameters are in `params`
    def execute(self, n_runs: int, params: dict):
        self.clean_up()

        measurement_times = dict()  # map that is returned with the measured times

        # Check whether tables contain scale factors
        with_scale_factors = False
        for table in params['data'].values():
            if table.get('scale_factors'):
                with_scale_factors = True
                break

        for _ in range(n_runs):
            try:
                # Set up database
                self.generate_create_table_stmts(params['data'], with_scale_factors)

                # If tables contain scale factors, they have to be loaded separately for every case
                if with_scale_factors and bool(params.get('readonly')):
                    # Write cases/queries to a file that will be passed to the command to execute
                    statements = list()
                    for case, query_stmt in params['cases'].items():
                        # Create tables from tmp tables with scale factor
                        for table_name, table in params['data'].items():
                            statements.append(f"DELETE FROM {table_name};")  # empty existing table
                            if table.get('scale_factors'):
                                sf = table['scale_factors'][case]
                            else:
                                sf = 1
                            header = int(table.get('header', 0))
                            num_rows = round((table['lines_in_file'] - header) * sf)
                            statements.append(f"INSERT INTO {table_name} SELECT * FROM {table_name}_tmp LIMIT {num_rows};")

                        statements.append(".timer on")
                        statements.append(query_stmt)  # Actual query from this case
                        statements.append(".timer off")

                    # Append statements to file
                    with open(TMP_SQL_FILE, "a+") as tmp:
                        for stmt in statements:
                            tmp.write(stmt + "\n")

                # Otherwise, tables have to be created just once before the measurements (done above)
                else:
                    # Write cases/queries to a file that will be passed to the command to execute
                    with open(TMP_SQL_FILE, "a+") as tmp:
                        tmp.write(".timer on\n")
                        for case_query in params['cases'].values():
                            tmp.write(case_query + '\n')
                        tmp.write(".timer off\n")

                # Execute query file and collect measurement data
                command = f"./{self.duckdb_cli} {TMP_DB} < {TMP_SQL_FILE}" + " | grep 'Run Time' | cut -d ' ' -f 5 | awk '{print $1 * 1000;}'"
                stream = os.popen(command)
                for idx, line in enumerate(stream):
                    time = float(line.replace("\n", "").replace(",", "."))  # in milliseconds
                    case = list(params['cases'].keys())[idx]
                    if case not in measurement_times:
                        measurement_times[case] = list()
                    measurement_times[case].append(time)
                stream.close()

            finally:
                self.clean_up()

        return {'DuckDB': measurement_times}


    # Deletes the temporary database and SQL file
    def clean_up(self):
        if os.path.exists(TMP_DB):
            os.remove(TMP_DB)
        if os.path.exists(TMP_SQL_FILE):
            os.remove(TMP_SQL_FILE)


    # Parse attributes of one table, return as string
    def parse_attributes(self, attributes: dict):
        columns = '('
        for column_name, ty in attributes.items():
            not_null = 'NOT NULL' if 'NOT NULL' in ty else ''
            ty = ty.split(' ')
            match ty[0]:
                case 'INT':
                    typ = 'INT'
                case 'CHAR':
                    typ = f'CHAR({ty[1]})'
                case 'DECIMAL':
                    typ = f'DECIMAL({ty[1]},{ty[2]})'
                case 'DATE':
                    typ = 'DATE'
                case 'DOUBLE':
                    typ = 'DOUBLE'
                case 'FLOAT':
                    typ = 'REAL'
                case 'BIGINT':
                    typ = 'BIGINT'
                case _:
                    raise Exception(f"Unknown type given for '{column_name}'")
            columns += f"{column_name} {typ} {not_null}, "
        columns = columns[:-2] + ')'
        return columns
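
    # Illustrative call (values invented; the result follows the code above):
    #   parse_attributes({'id': 'INT NOT NULL', 'price': 'DECIMAL 10 2'})
    #   returns '(id INT NOT NULL, price DECIMAL(10,2) )'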


    # Creates tables in the database and copies the contents of the given files into them.
    # Call with `with_scale_factors=False` if data should be loaded as a whole.
    # Call with `with_scale_factors=True` if data should be placed in tmp tables
    # and copied for each case with a different scale factor.
    def generate_create_table_stmts(self, data: dict, with_scale_factors):
        statements = list()
        for table_name, table in data.items():
            columns = self.parse_attributes(table['attributes'])

            delimiter = table.get('delimiter')
            header = table.get('header')
            file_format = table.get('format')  # avoid KeyError and shadowing the built-in `format`

            if with_scale_factors:
                table_name += "_tmp"

            create = f"CREATE TABLE {table_name} {columns};"
            copy = f"COPY {table_name} FROM '{table['file']}' ( "
            if delimiter:
                delim = delimiter.replace("'", "")
                copy += f" DELIMITER '{delim}',"
            if file_format:
                copy += f" FORMAT {file_format.upper()},"
            if header == 1:
                copy += " HEADER,"

            copy = copy[:-1] + " );"

            statements.append(create)
            statements.append(copy)

            if with_scale_factors:
                # Create the actual table that will be used for the experiment
                statements.append(f"CREATE TABLE {table_name[:-4]} {columns};")

        with open(TMP_SQL_FILE, "w") as tmp:
            for stmt in statements:
                tmp.write(stmt + "\n")
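
    # Illustrative output for a hypothetical CSV table Numbers(id INT, val FLOAT)
    # with with_scale_factors=True (table and file names are invented):
    #   CREATE TABLE Numbers_tmp (id INT , val REAL );
    #   COPY Numbers_tmp FROM 'benchmark/data/numbers.csv' (  DELIMITER ',', FORMAT CSV, HEADER );
    #   CREATE TABLE Numbers (id INT , val REAL );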