Skip to content

Commit

Permalink
fix(Dataset): do not create metadata when the file format is not supported
Browse files Browse the repository at this point in the history
  • Loading branch information
nazarfil committed Aug 7, 2024
1 parent a347e39 commit b449ff1
Showing 1 changed file with 27 additions and 18 deletions.
45 changes: 27 additions & 18 deletions hexa/datasets/queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,27 @@
logger = getLogger(__name__)


def is_supported_mimetype(filename: str) -> bool:
    """Return True if *filename* looks like a format we can load as a DataFrame.

    The guessed MIME type is checked first; for formats that ``mimetypes``
    does not know about (e.g. parquet) we fall back to the raw file
    extension. The extension check is case-insensitive, so ``DATA.PARQUET``
    is accepted as well.

    :param filename: file name (or path) whose format should be checked.
    :return: True when the file is CSV, Excel or parquet, False otherwise.
    """
    # Sets give O(1) membership tests and read as "closed list of formats".
    supported_mimetypes = {
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-excel",
        "application/vnd.apache.parquet",
        "text/csv",
    }
    supported_extensions = {"parquet"}
    # rsplit with maxsplit=1 isolates the last extension; for a dotless
    # filename this yields the whole name, which simply won't match.
    suffix = filename.rsplit(".", 1)[-1].lower()
    mime_type, _encoding = mimetypes.guess_type(filename, strict=False)
    return mime_type in supported_mimetypes or suffix in supported_extensions


def download_file_as_dataframe(
dataset_version_file: DatasetVersionFile,
) -> pd.DataFrame | None:
mime_type, encoding = mimetypes.guess_type(
dataset_version_file.filename, strict=False
)
download_url = generate_download_url(dataset_version_file)
print(
f"Downloading file {download_url} for filename : {dataset_version_file.filename}"
)
logger.info(f"Downloading file {download_url}")
if mime_type == "text/csv":
return pd.read_csv(download_url)
elif (
Expand All @@ -40,9 +51,6 @@ def download_file_as_dataframe(
or dataset_version_file.filename.split(".")[-1] == "parquet"
):
return pd.read_parquet(download_url)
else:
logger.info(f"Unsupported file format: {dataset_version_file.filename}")
return None


def generate_dataset_file_sample_task(
Expand All @@ -59,6 +67,10 @@ def generate_dataset_file_sample_task(
)
return

if not is_supported_mimetype(dataset_version_file.filename):
logger.info(f"Unsupported file format: {dataset_version_file.filename}")
return

logger.info(f"Creating dataset sample for version file {dataset_version_file.id}")
try:
dataset_file_metadata = DatasetFileMetadata.objects.create(
Expand All @@ -71,19 +83,16 @@ def generate_dataset_file_sample_task(

try:
file_content = download_file_as_dataframe(dataset_version_file)
if file_content is None:
dataset_file_metadata.sample = json.dumps([])
if not file_content.empty:
random_seed = 22
file_sample = file_content.sample(
settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE,
random_state=random_seed,
replace=True,
)
dataset_file_metadata.sample = file_sample.to_json(orient="records")
else:
if not file_content.empty:
random_seed = 22
file_sample = file_content.sample(
settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE,
random_state=random_seed,
replace=True,
)
dataset_file_metadata.sample = file_sample.to_json(orient="records")
else:
dataset_file_metadata.sample = json.dumps([])
dataset_file_metadata.sample = json.dumps([])
logger.info(f"Dataset sample saved for file {dataset_version_file_id}")
dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED
dataset_file_metadata.save()
Expand Down

0 comments on commit b449ff1

Please sign in to comment.