Skip to content

Commit

Permalink
Merge pull request #27 from yuriihavrylko/feature/l3t3-format-benchmark
Browse files Browse the repository at this point in the history
L3T3 Format benchmark
  • Loading branch information
yuriihavrylko authored Feb 25, 2024
2 parents 7ff5d76 + fc0014e commit 779ae46
Show file tree
Hide file tree
Showing 9 changed files with 151 additions and 2 deletions.
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,36 @@ Kubernetes
kubectl create -f deployment/minio.yml
```

### Tests

Run tests
```
pytest app/tests/
```

### Benchmarks

File formats

![Alt text](assets/format_benchmark.png)

| Format | Avg Write Time (s) | Avg Read Time (s) | File Size after Write (MB) |
|----------|--------------------|-------------------|----------------------------|
| CSV | 0.906960 | 0.174510 | 5.649742 |
| JSON | 0.386252 | 1.161783 | 16.038124 |
| PARQUET | 0.061314 | 0.016811 | 1.507380 |
| ORC | 0.167490 | 0.016776 | 6.998336 |



CSV format shows the slowest write times of the tested formats, with moderate read times and a moderate file size after write.

JSON format demonstrates faster write times than CSV (but slower than PARQUET and ORC), the slowest read times, and the largest file size after write.

PARQUET format showcases the fastest write times, read times on par with ORC, and the smallest file size after write among the tested formats.

ORC format exhibits moderate write times and the fastest read times (essentially tied with PARQUET), but its file size after write is larger than both CSV and PARQUET.

### POD autoscaling

Install metric service
Expand Down Expand Up @@ -249,4 +279,3 @@ python -m src.monitoring.drift
```

![Alt text](assets/drift.png)

6 changes: 5 additions & 1 deletion app/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
-r requirements.txt

pytest==7.4.4
pytest-mock==3.12.0
datasets==2.16.1
pandas==2.0.3
evaluate==0.4.1
great-expectations==0.18.7
pytest==7.4.4
Expand All @@ -13,3 +16,4 @@ textpruner==1.1.post2
evidently==0.4.13
sentence_transformers==2.2.2
ipykernel==6.28.0

2 changes: 2 additions & 0 deletions app/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
flask==3.0.0
gunicorn==21.2.0
boto3==1.34.12
botocore==1.34.12
transformers==4.36.2
streamlit==1.29.0
fastapi>=0.95.0
Expand Down
40 changes: 40 additions & 0 deletions app/src/helpers/storage/minio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import boto3
from botocore.exceptions import ClientError

class MinioClient:
    """Thin wrapper around a boto3 S3 client bound to a custom endpoint.

    Each method forwards directly to the corresponding call on
    ``self.s3_client``. Exceptions raised by the underlying client are not
    intercepted: they propagate to the caller unchanged.
    """

    def __init__(self, endpoint_url: str, access_key: str, secret_key: str) -> None:
        """Create the underlying S3 client.

        Args:
            endpoint_url: URL passed to boto3 as ``endpoint_url``.
            access_key: Value passed as ``aws_access_key_id``.
            secret_key: Value passed as ``aws_secret_access_key``.
        """
        self.s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
        )

    def create_bucket(self, bucket_name: str) -> None:
        """Create the bucket named ``bucket_name``."""
        self.s3_client.create_bucket(Bucket=bucket_name)

    def upload_file(self, bucket_name: str, file_name: str, file_path: str) -> None:
        """Upload the local file at ``file_path`` to ``bucket_name`` under key ``file_name``."""
        # boto3's argument order is (local path, bucket, key), which differs
        # from this method's (bucket, key, local path) signature.
        self.s3_client.upload_file(file_path, bucket_name, file_name)

    def download_file(self, bucket_name: str, file_name: str, download_path: str) -> None:
        """Download key ``file_name`` from ``bucket_name`` to ``download_path``."""
        self.s3_client.download_file(bucket_name, file_name, download_path)

    def delete_file(self, bucket_name: str, file_name: str) -> None:
        """Delete the object stored under key ``file_name`` in ``bucket_name``."""
        self.s3_client.delete_object(Bucket=bucket_name, Key=file_name)

    def delete_bucket(self, bucket_name: str) -> None:
        """Delete the bucket named ``bucket_name``."""
        self.s3_client.delete_bucket(Bucket=bucket_name)
Empty file added app/tests/helpers/__init__.py
Empty file.
Empty file.
34 changes: 34 additions & 0 deletions app/tests/helpers/storage/test_minio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import pytest
from src.helpers.storage.minio import MinioClient

@pytest.fixture
def minio_client():
    """Build a MinioClient for http://localhost:9000 with placeholder credentials."""
    return MinioClient(
        endpoint_url='http://localhost:9000',
        access_key='minio_access_key',
        secret_key='minio_secret_key',
    )

def test_create_bucket(mocker, minio_client):
    """create_bucket passes the bucket name to the S3 client as ``Bucket``."""
    bucket = 'testbucket'
    patched = mocker.patch.object(minio_client.s3_client, 'create_bucket')

    minio_client.create_bucket(bucket)

    patched.assert_called_once_with(Bucket=bucket)

def test_upload_file(mocker, minio_client):
    """upload_file calls the S3 client with (local path, bucket, key)."""
    bucket, key, local_path = 'testbucket', 'test.txt', '/path/to/test.txt'
    patched = mocker.patch.object(minio_client.s3_client, 'upload_file')

    minio_client.upload_file(bucket, key, local_path)

    patched.assert_called_once_with(local_path, bucket, key)

def test_download_file(mocker, minio_client):
    """download_file calls the S3 client with (bucket, key, destination path)."""
    call_args = ('testbucket', 'test.txt', '/path/to/downloaded/test.txt')
    patched = mocker.patch.object(minio_client.s3_client, 'download_file')

    minio_client.download_file(*call_args)

    patched.assert_called_once_with(*call_args)

def test_delete_file(mocker, minio_client):
    """delete_file maps to the S3 client's delete_object with ``Bucket`` and ``Key``."""
    bucket, key = 'testbucket', 'test.txt'
    patched = mocker.patch.object(minio_client.s3_client, 'delete_object')

    minio_client.delete_file(bucket, key)

    patched.assert_called_once_with(Bucket=bucket, Key=key)

def test_delete_bucket(mocker, minio_client):
    """delete_bucket passes the bucket name to the S3 client as ``Bucket``."""
    bucket = 'testbucket'
    patched = mocker.patch.object(minio_client.s3_client, 'delete_bucket')

    minio_client.delete_bucket(bucket)

    patched.assert_called_once_with(Bucket=bucket)
Binary file added assets/format_benchmark.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
40 changes: 40 additions & 0 deletions benchmarks/data_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from datasets import load_dataset
import pandas as pd
import timeit
import os

# Benchmark script: times writing and reading one DataFrame in several file
# formats and reports the size of each written file. The sample.* files are
# written to the current working directory and are not removed afterwards.

# Number of repetitions per timing. The same value is used as the divisor
# when reporting averages, so the two can never drift apart.
N_RUNS = 10

dataset = load_dataset("inria-soda/tabular-benchmark", 'clf_cat_albert')

# Materialize the 'train' split as a DataFrame once; every format is
# benchmarked against this same frame.
df = pd.DataFrame(dataset['train'])

# Maps a format name (also used as the file extension) to its writer and
# reader. Writers take the DataFrame; readers take no arguments.
file_formats = {
    'csv': {
        'write': lambda df: df.to_csv('sample.csv', index=False),
        'read': lambda: pd.read_csv('sample.csv'),
    },
    'json': {
        'write': lambda df: df.to_json('sample.json', orient='records'),
        'read': lambda: pd.read_json('sample.json'),
    },
    'parquet': {
        'write': lambda df: df.to_parquet('sample.parquet', index=False),
        'read': lambda: pd.read_parquet('sample.parquet'),
    },
    'orc': {
        'write': lambda df: df.to_orc('sample.orc', index=False),
        'read': lambda: pd.read_orc('sample.orc'),
    },
}

# `fmt` rather than `format`, so the builtin is not shadowed at module level.
for fmt, methods in file_formats.items():
    print(f"Testing for {fmt.upper()} format:")

    # timeit returns the total time for all N_RUNS calls; averaged below.
    # The write must run before the read so the file exists.
    write_time = timeit.timeit(lambda: methods['write'](df), number=N_RUNS)
    read_time = timeit.timeit(methods['read'], number=N_RUNS)

    write_file_size = os.path.getsize(f'sample.{fmt}') / (1024 * 1024)  # Convert bytes to MB

    print(f"Average time taken to write: {write_time / N_RUNS:.6f} seconds")
    print(f"Average time taken to read: {read_time / N_RUNS:.6f} seconds")
    print(f"File size after write: {write_file_size:.6f} MB")
    print("\n")

0 comments on commit 779ae46

Please sign in to comment.