added data file format benchmark

yuriihavrylko · Jan 4, 2024 · 15cbd16 · 15cbd16
1 parent 4f17819
commit 15cbd16
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -54,4 +54,27 @@ kubectl create -f deployment/minio.yml
 Run tests
 ```
 pytest app/tests/
-```
+```
+
+### Benchmarks
+
+File formats
+
+![Data file format benchmark: write time, read time and file size per format](assets/format_benchmark.png)
+
+| Format   | Avg Write Time (s) | Avg Read Time (s) | File Size after Write (MB) |
+|----------|--------------------|-------------------|----------------------------|
+| CSV      | 0.906960           | 0.174510          | 5.649742                   |
+| JSON     | 0.386252           | 1.161783          | 16.038124                  |
+| PARQUET  | 0.061314           | 0.016811          | 1.507380                   |
+| ORC      | 0.167490           | 0.016776          | 6.998336                   |
+
+
+
+CSV format shows the slowest write times of the tested formats, with moderate read times and a moderate file size after write.
+
+JSON format writes faster than CSV but slower than PARQUET and ORC, and has the slowest read times and the largest file size after write.
+
+PARQUET format showcases the fastest write times, read times virtually tied with ORC for the fastest, and the smallest file size after write among the tested formats.
+
+ORC format exhibits moderate write times and the fastest read times (marginally ahead of PARQUET), but its file size after write is larger than both CSV and PARQUET; only JSON is larger.
diff --git a/app/requirements-dev.txt b/app/requirements-dev.txt
@@ -1,3 +1,5 @@
 -r requirements.txt
 pytest==7.4.4
 pytest-mock==3.12.0
+datasets==2.16.1
+pandas==2.1.4
diff --git a/assets/format_benchmark.png b/assets/format_benchmark.png
diff --git a/benchmarks/data_formats.py b/benchmarks/data_formats.py
@@ -0,0 +1,40 @@
+from datasets import load_dataset
+import pandas as pd
+import timeit
+import os
+
+# Benchmark how fast pandas writes and reads one DataFrame in several file
+# formats, and how large the resulting file is on disk.
+
+# Number of timeit repetitions per operation; printed times are averages over it.
+N_RUNS = 10
+
+dataset = load_dataset("inria-soda/tabular-benchmark", 'clf_cat_albert')
+
+df = pd.DataFrame(dataset['train'])
+
+# Keyed by file extension. 'write' takes the DataFrame and saves it to
+# sample.<extension>; 'read' takes no arguments and loads that same file back.
+file_formats = {
+    'csv': {
+        'write': lambda df: df.to_csv('sample.csv', index=False),
+        'read': lambda: pd.read_csv('sample.csv')
+    },
+    'json': {
+        'write': lambda df: df.to_json('sample.json', orient='records'),
+        'read': lambda: pd.read_json('sample.json')
+    },
+    'parquet': {
+        'write': lambda df: df.to_parquet('sample.parquet', index=False),
+        'read': lambda: pd.read_parquet('sample.parquet')
+    },
+    'orc': {
+        'write': lambda df: df.to_orc('sample.orc', index=False),
+        'read': lambda: pd.read_orc('sample.orc')
+    }
+}
+
+# `fmt` rather than `format`, so the builtin format() is not shadowed.
+for fmt, methods in file_formats.items():
+    print(f"Testing for {fmt.upper()} format:")
+
+    # Writes are timed first: they create the file that the read timing and the
+    # size measurement below depend on.
+    write_time = timeit.timeit(lambda: methods['write'](df), number=N_RUNS)
+    read_time = timeit.timeit(methods['read'], number=N_RUNS)
+
+    write_file_size = os.path.getsize(f'sample.{fmt}') / (1024 * 1024)  # Convert bytes to MB
+
+    # timeit returns the total for all runs, so divide to get the per-run average.
+    print(f"Average time taken to write: {write_time / N_RUNS:.6f} seconds")
+    print(f"Average time taken to read: {read_time / N_RUNS:.6f} seconds")
+    print(f"File size after write: {write_file_size:.6f} MB")
+    print("\n")