diff --git a/data_juicer/core/exporter.py b/data_juicer/core/exporter.py
index 9450c9482..1ca6f19bf 100644
--- a/data_juicer/core/exporter.py
+++ b/data_juicer/core/exporter.py
@@ -176,7 +176,7 @@ def export(self, dataset):
     @staticmethod
     def to_jsonl(dataset, export_path, num_proc=1, **kwargs):
         """
-        Export method for json/jsonl target files.
+        Export method for jsonl target files.
 
         :param dataset: the dataset to export.
         :param export_path: the path to store the exported dataset.
@@ -186,6 +186,19 @@ def to_jsonl(dataset, export_path, num_proc=1, **kwargs):
         """
         dataset.to_json(export_path, force_ascii=False, num_proc=num_proc)
 
+    @staticmethod
+    def to_json(dataset, export_path, num_proc=1, **kwargs):
+        """
+        Export method for json target files.
+
+        :param dataset: the dataset to export.
+        :param export_path: the path to store the exported dataset.
+        :param num_proc: the number of processes used to export the dataset.
+        :param kwargs: extra arguments.
+        :return:
+        """
+        dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False)
+
     @staticmethod
     def to_parquet(dataset, export_path, **kwargs):
         """
@@ -208,6 +221,6 @@ def _router():
         """
         return {
             'jsonl': Exporter.to_jsonl,
-            'json': Exporter.to_jsonl,
+            'json': Exporter.to_json,
             'parquet': Exporter.to_parquet,
         }
diff --git a/data_juicer/format/formatter.py b/data_juicer/format/formatter.py
index bd506be81..a297463b7 100644
--- a/data_juicer/format/formatter.py
+++ b/data_juicer/format/formatter.py
@@ -221,6 +221,8 @@ def non_empty_text(sample, target_keys):
         # function to convert relative paths to absolute paths
         def rel2abs(sample, path_keys, dataset_dir):
             for path_key in path_keys:
+                if path_key not in sample:
+                    continue
                 paths = sample[path_key]
                 if not paths:
                     continue