From 9a9ee5468140b234b058cc5735229d6e26e65b26 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Sat, 24 Aug 2024 22:10:23 +0300 Subject: [PATCH] Add support for a default extension name when uploading a pandas dataframe artifact (see `sdk.development.artifacts.default_pandas_dataframe_extension_name`) --- clearml/binding/artifacts.py | 20 ++++++++++++++------ docs/clearml.conf | 7 +++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/clearml/binding/artifacts.py b/clearml/binding/artifacts.py index 4874ae5a..7d4b382e 100644 --- a/clearml/binding/artifacts.py +++ b/clearml/binding/artifacts.py @@ -28,6 +28,7 @@ from ..storage.util import sha256sum, format_size, get_common_path from ..utilities.process.mp import SafeEvent, ForkSafeRLock from ..utilities.proxy_object import LazyEvalWrapper +from ..config import deferred_config try: import pandas as pd @@ -262,6 +263,9 @@ class Artifacts(object): # hashing constants _hash_block_size = 65536 _pd_artifact_type = 'data-audit-table' + _default_pandas_dataframe_extension_name = deferred_config( + "development.artifacts.default_pandas_dataframe_extension_name", None + ) class _ProxyDictWrite(dict): """ Dictionary wrapper that updates an arguments instance on any item set in the dictionary """ @@ -464,19 +468,23 @@ def get_extension(extension_name_, valid_extensions, default_extension, artifact artifact_type_data.content_type = "text/csv" np.savetxt(local_filename, artifact_object, delimiter=",") delete_after_upload = True - elif pd and isinstance(artifact_object, pd.DataFrame) \ - and (isinstance(artifact_object.index, pd.MultiIndex) or - isinstance(artifact_object.columns, pd.MultiIndex)): - store_as_pickle = True elif pd and isinstance(artifact_object, pd.DataFrame): artifact_type = "pandas" artifact_type_data.preview = preview or str(artifact_object.__repr__()) + # we are making sure self._default_pandas_dataframe_extension_name is not deferred + extension_name = extension_name or str(self._default_pandas_dataframe_extension_name or "") override_filename_ext_in_uri = get_extension( extension_name, [".csv.gz", ".parquet", ".feather", ".pickle"], ".csv.gz", artifact_type ) override_filename_in_uri = name - local_filename = self._push_temp_file(prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) - if override_filename_ext_in_uri == ".csv.gz": + local_filename = self._push_temp_file( + prefix=quote(name, safe="") + ".", suffix=override_filename_ext_in_uri + ) + if ( + isinstance(artifact_object.index, pd.MultiIndex) or isinstance(artifact_object.columns, pd.MultiIndex) + ) and not extension_name: + store_as_pickle = True + elif override_filename_ext_in_uri == ".csv.gz": artifact_type_data.content_type = "text/csv" self._store_compressed_pd_csv(artifact_object, local_filename) elif override_filename_ext_in_uri == ".parquet": diff --git a/docs/clearml.conf b/docs/clearml.conf index 5c6b52d9..1c2b5ab1 100644 --- a/docs/clearml.conf +++ b/docs/clearml.conf @@ -240,5 +240,12 @@ sdk { # iteration reporting x-axis after starting to report "seconds from start" # max_wait_for_first_iteration_to_start_sec: 1800 } + + artifacts { + # set default extension_name for pandas DataFrame objects + # valid values are: ``.csv.gz``, ``.parquet``, ``.feather``, ``.pickle`` + # extension_name supplied to Task.upload_artifact is prioritized over this value + default_pandas_dataframe_extension_name: "" + } } }