diff --git a/README.md b/README.md
index 7937a4b..b699135 100644
--- a/README.md
+++ b/README.md
@@ -54,4 +54,25 @@ kubectl create -f deployment/minio.yml
 Run tests
 ```
 pytest app/tests/
-```
\ No newline at end of file
+```
+
+### Benchmarks
+
+#### File formats
+
+![File format benchmark results](assets/format_benchmark.png)
+
+| Format  | Avg Write Time (s) | Avg Read Time (s) | File Size after Write (MB) |
+|---------|--------------------|-------------------|----------------------------|
+| CSV     | 0.906960           | 0.174510          | 5.649742                   |
+| JSON    | 0.386252           | 1.161783          | 16.038124                  |
+| PARQUET | 0.061314           | 0.016811          | 1.507380                   |
+| ORC     | 0.167490           | 0.016776          | 6.998336                   |
+
+CSV is the slowest format to write, reads reasonably quickly, and produces a mid-sized file.
+
+JSON writes faster than CSV but is by far the slowest to read and produces the largest file.
+
+PARQUET is the fastest to write, reads about as fast as ORC, and produces the smallest file of the four formats.
+
+ORC writes at a moderate speed and matches PARQUET on reads, but its file is larger than CSV's and second only to JSON's.
diff --git a/app/requirements-dev.txt b/app/requirements-dev.txt
index c5bbb1f..25db5c8 100644
--- a/app/requirements-dev.txt
+++ b/app/requirements-dev.txt
@@ -1,3 +1,5 @@
 -r requirements.txt
 pytest==7.4.4
 pytest-mock==3.12.0
+datasets==2.16.1
+pandas==2.1.4
diff --git a/assets/format_benchmark.png b/assets/format_benchmark.png
new file mode 100644
index 0000000..b9e96d9
Binary files /dev/null and b/assets/format_benchmark.png differ
diff --git a/benchmarks/data_formats.py b/benchmarks/data_formats.py
new file mode 100644
index 0000000..fd95b24
--- /dev/null
+++ b/benchmarks/data_formats.py
@@ -0,0 +1,43 @@
+from datasets import load_dataset
+import pandas as pd
+import timeit
+import os
+
+# Use the train split of the clf_cat_albert config from the tabular-benchmark dataset.
+dataset = load_dataset("inria-soda/tabular-benchmark", 'clf_cat_albert')
+
+df = pd.DataFrame(dataset['train'])
+
+# Write and read callables for each file format under test.
+file_formats = {
+    'csv': {
+        'write': lambda df: df.to_csv('sample.csv', index=False),
+        'read': lambda: pd.read_csv('sample.csv')
+    },
+    'json': {
+        'write': lambda df: df.to_json('sample.json', orient='records'),
+        'read': lambda: pd.read_json('sample.json')
+    },
+    'parquet': {
+        'write': lambda df: df.to_parquet('sample.parquet', index=False),
+        'read': lambda: pd.read_parquet('sample.parquet')
+    },
+    'orc': {
+        'write': lambda df: df.to_orc('sample.orc', index=False),
+        'read': lambda: pd.read_orc('sample.orc')
+    }
+}
+
+# Time 10 writes and 10 reads per format, then report the averages and the on-disk size.
+for fmt, methods in file_formats.items():
+    print(f"Testing for {fmt.upper()} format:")
+
+    write_time = timeit.timeit(lambda: methods['write'](df), number=10)
+    read_time = timeit.timeit(methods['read'], number=10)
+
+    write_file_size = os.path.getsize(f'sample.{fmt}') / (1024 * 1024)  # Convert bytes to MB
+
+    print(f"Average time taken to write: {write_time / 10:.6f} seconds")
+    print(f"Average time taken to read: {read_time / 10:.6f} seconds")
+    print(f"File size after write: {write_file_size:.6f} MB")
+    print("\n")
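
The chart and table added to the README summarize the numbers this script prints. Below is a minimal sketch of one way they could be regenerated, assuming it is appended to `benchmarks/data_formats.py` (so `df` and `file_formats` are already in scope) and that `matplotlib` and `tabulate` are installed; neither is listed in `app/requirements-dev.txt`.

```
# Hypothetical continuation of benchmarks/data_formats.py: `df` and `file_formats`
# are assumed to already be in scope. matplotlib and tabulate are extra
# dependencies that app/requirements-dev.txt does not list.
import os
import timeit

import matplotlib.pyplot as plt
import pandas as pd

results = []
for fmt, methods in file_formats.items():
    # Average each operation over 10 runs, mirroring the benchmark loop above.
    write_time = timeit.timeit(lambda: methods['write'](df), number=10) / 10
    read_time = timeit.timeit(methods['read'], number=10) / 10
    size_mb = os.path.getsize(f'sample.{fmt}') / (1024 * 1024)
    results.append({
        'Format': fmt.upper(),
        'Avg Write Time (s)': write_time,
        'Avg Read Time (s)': read_time,
        'File Size after Write (MB)': size_mb,
    })

summary = pd.DataFrame(results).set_index('Format')

# Bar chart comparable to assets/format_benchmark.png.
summary.plot(kind='bar', subplots=True, layout=(1, 3), figsize=(12, 4), legend=False)
plt.tight_layout()
plt.savefig('format_benchmark.png')

# Markdown table in the same shape as the one in the README
# (DataFrame.to_markdown requires the optional tabulate package).
print(summary.to_markdown(floatfmt='.6f'))
```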