From 32f40dd354ccb18824f9153655e45216265733f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=93=E8=BE=95?= Date: Tue, 19 Dec 2023 15:26:20 +0800 Subject: [PATCH] - added support for jsonl.zst type when calling get_access_log() --- data_juicer/utils/constant.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py index a58bc3b34..1eeccf998 100644 --- a/data_juicer/utils/constant.py +++ b/data_juicer/utils/constant.py @@ -1,5 +1,6 @@ import copy import inspect +import io import os import zstandard as zstd @@ -62,9 +63,9 @@ def get_access_log(cls, dj_cfg=None): # Create a stream reader for the file and decode the # first line with dctx.stream_reader(compressed_file) as reader: - first_line_bytes = reader.readline() - # Assuming the file is encoded in UTF-8 - first_line = first_line_bytes.decode('utf-8') + text_stream = io.TextIOWrapper( + reader, encoding='utf-8') + first_line = text_stream.readline() elif 'jsonl' in dj_cfg.dataset_path: tmp_f_name = dj_cfg.dataset_path. \ replace('.jsonl', '.tmp.jsonl')