Skip to content

Commit

Permalink
- Added support for the jsonl.zst file type when calling get_access_log()
Browse files Browse the repository at this point in the history
  • Loading branch information
yxdyc committed Dec 19, 2023
1 parent 325a855 commit 32f40dd
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import copy
import inspect
import io
import os

import zstandard as zstd
Expand Down Expand Up @@ -62,9 +63,9 @@ def get_access_log(cls, dj_cfg=None):
# Create a stream reader for the file and decode the
# first line
with dctx.stream_reader(compressed_file) as reader:
first_line_bytes = reader.readline()
# Assuming the file is encoded in UTF-8
first_line = first_line_bytes.decode('utf-8')
text_stream = io.TextIOWrapper(
reader, encoding='utf-8')
first_line = text_stream.readline()
elif 'jsonl' in dj_cfg.dataset_path:
tmp_f_name = dj_cfg.dataset_path. \
replace('.jsonl', '.tmp.jsonl')
Expand Down

0 comments on commit 32f40dd

Please sign in to comment.