Skip to content

Commit

Permalink
* avoid missing image_key error for text-only datasets
Browse files Browse the repository at this point in the history
+ support exporting to json format (not jsonl!)
  • Loading branch information
HYLcool committed Nov 8, 2023
1 parent cd817ab commit d0e69d7
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
17 changes: 15 additions & 2 deletions data_juicer/core/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def export(self, dataset):
@staticmethod
def to_jsonl(dataset, export_path, num_proc=1, **kwargs):
"""
Export method for json/jsonl target files.
Export method for jsonl target files.
:param dataset: the dataset to export.
:param export_path: the path to store the exported dataset.
Expand All @@ -186,6 +186,19 @@ def to_jsonl(dataset, export_path, num_proc=1, **kwargs):
"""
dataset.to_json(export_path, force_ascii=False, num_proc=num_proc)

@staticmethod
def to_json(dataset, export_path, num_proc=1, **kwargs):
"""
Export method for json target files (a regular json file, not jsonl).

Delegates to ``dataset.to_json`` with ``lines=False``; the jsonl
exporter in this class makes the same call without that argument.

:param dataset: the dataset to export; it must provide a ``to_json``
method accepting the arguments passed below.
:param export_path: the path to store the exported dataset.
:param num_proc: the number of processes used to export the dataset.
:param kwargs: extra arguments; accepted but not used by this method.
:return: None; the return value of ``dataset.to_json`` is discarded.
"""
# NOTE(review): force_ascii=False is presumably there to keep non-ASCII
# text unescaped in the output -- confirm against the dataset library docs.
dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False)

@staticmethod
def to_parquet(dataset, export_path, **kwargs):
"""
Expand All @@ -208,6 +221,6 @@ def _router():
"""
return {
'jsonl': Exporter.to_jsonl,
'json': Exporter.to_jsonl,
'json': Exporter.to_json,
'parquet': Exporter.to_parquet,
}
2 changes: 2 additions & 0 deletions data_juicer/format/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ def non_empty_text(sample, target_keys):
# function to convert relative paths to absolute paths
def rel2abs(sample, path_keys, dataset_dir):
for path_key in path_keys:
if path_key not in sample:
continue
paths = sample[path_key]
if not paths:
continue
Expand Down

0 comments on commit d0e69d7

Please sign in to comment.