Skip to content

Commit

Permalink
Fix/several minor bugs (#303)
Browse files Browse the repository at this point in the history
* * throw explicit errors when video path doesn't exist

* * Suppress the debug info when initializing data-juicer
* add debug mode

* + add mode to load_video func

* + add mode to load_video func

* * update pandas to 2.0.3

* * change ValueError to FileNotFoundError
+ add detailed comments
  • Loading branch information
HYLcool authored Apr 24, 2024
1 parent 517efe1 commit eaf7746
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 15 deletions.
6 changes: 6 additions & 0 deletions data_juicer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@

import os
import subprocess
import sys

import multiprocess as mp
from loguru import logger

from data_juicer.utils.availability_utils import _is_package_available

# For now, only INFO will be shown. Later the severity level will be changed
# when setup_logger is called to initialize the logger.
logger.remove()
logger.add(sys.stderr, level='INFO')


def _cuda_device_count():
_torch_available = _is_package_available('torch')
Expand Down
8 changes: 8 additions & 0 deletions data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,10 @@ def init_configs(args=None):
default='auto',
help='The address of the Ray cluster.')

parser.add_argument('--debug',
action='store_true',
help='Whether to run in debug mode.')

# add all parameters of the registered ops class to the parser,
# and these op parameters can be modified through the command line,
ops_sorted_by_types = sort_op_by_types_and_names(OPERATORS.modules.items())
Expand All @@ -330,6 +334,9 @@ def init_configs(args=None):
global_cfg = cfg
global_parser = parser

if cfg.debug:
logger.debug('In DEBUG mode.')

return cfg
except ArgumentError:
logger.error('Config initialization failed')
Expand Down Expand Up @@ -378,6 +385,7 @@ def init_setup_from_cfg(cfg):
logfile_name = f'export_{export_rel_path}_time_{timestamp}.txt'
setup_logger(save_dir=log_dir,
filename=logfile_name,
level='DEBUG' if cfg.debug else 'INFO',
redirect=cfg.executor_type == 'default')

# check and get dataset dir
Expand Down
6 changes: 4 additions & 2 deletions data_juicer/utils/logger_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def setup_logger(save_dir,
distributed_rank=0,
filename='log.txt',
mode='o',
level='INFO',
redirect=True):
"""
Setup logger for training and testing.
Expand All @@ -104,6 +105,7 @@ def setup_logger(save_dir,
:param distributed_rank: device rank when multi-gpu environment
:param filename: log file name to save
:param mode: log file write mode, `append` or `override`. default is `o`.
:param level: log severity level. It's "INFO" by default.
:param redirect: whether to redirect system output
:return: logger instance.
"""
Expand All @@ -127,14 +129,14 @@ def setup_logger(save_dir,
logger.add(
sys.stderr,
format=loguru_format,
level='INFO',
level=level,
enqueue=True,
)
logger.add(save_file)

# redirect stdout/stderr to loguru
if redirect:
redirect_sys_output('INFO')
redirect_sys_output(level)
LOGGER_SETUP = True


Expand Down
27 changes: 15 additions & 12 deletions data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,19 @@ def load_videos(paths):
return [load_video(path) for path in paths]


def load_video(path):
def load_video(path, mode='r'):
"""
Load a video using its path.

:param path: the path to this video.
:param mode: the loading mode, passed through to ``av.open``. It's "r"
    by default. When the mode contains "r", ``path`` must already exist.
:return: a container object from the PyAV library, which contains all
    streams in this video (video/audio/...) and can be used to decode
    these streams to frames.
:raises FileNotFoundError: if ``path`` does not exist and ``mode``
    contains "r".
"""
container = av.open(path)
# Fail early with an explicit error for a missing input. The existence
# check is skipped for modes without "r" (e.g. "w"), which callers use to
# create output files that are not expected to exist yet.
if not os.path.exists(path) and 'r' in mode:
raise FileNotFoundError(f'Video [{path}] does not exist!')
container = av.open(path, mode)
return container


Expand All @@ -188,7 +191,7 @@ def get_video_duration(input_video: Union[str, av.container.InputContainer],
:return: duration of the video in second
"""
if isinstance(input_video, str):
container = av.open(input_video)
container = load_video(input_video)
elif isinstance(input_video, av.container.InputContainer):
container = input_video
else:
Expand All @@ -215,7 +218,7 @@ def get_decoded_frames_from_video(
:return: an iterator of all the frames of the video
"""
if isinstance(input_video, str):
container = av.open(input_video)
container = load_video(input_video)
elif isinstance(input_video, av.container.InputContainer):
container = input_video
stream = container.streams.video[video_stream_index]
Expand Down Expand Up @@ -243,12 +246,12 @@ def cut_video_by_seconds(
"""
# open the original video
if isinstance(input_video, str):
container = av.open(input_video)
container = load_video(input_video)
else:
container = input_video

# create the output video
output_container = av.open(output_video, 'w')
output_container = load_video(output_video, 'w')

# add the video stream into the output video according to input video
input_video_stream = container.streams.video[0]
Expand Down Expand Up @@ -340,12 +343,12 @@ def process_each_frame(input_video: Union[str, av.container.InputContainer],

# open the original video
if isinstance(input_video, str):
container = av.open(input_video)
container = load_video(input_video)
else:
container = input_video

# create the output video
output_container = av.open(output_video, 'w')
output_container = load_video(output_video, 'w')

# add the audio stream into the output video with template of input audio
for input_audio_stream in container.streams.audio:
Expand Down Expand Up @@ -402,7 +405,7 @@ def extract_key_frames(input_video: Union[str, av.container.InputContainer]):
"""
# load the input video
if isinstance(input_video, str):
container = av.open(input_video)
container = load_video(input_video)
elif isinstance(input_video, av.container.InputContainer):
container = input_video
else:
Expand Down Expand Up @@ -462,7 +465,7 @@ def extract_video_frames_uniformly(
"""
# load the input video
if isinstance(input_video, str):
container = av.open(input_video)
container = load_video(input_video)
elif isinstance(input_video, av.container.InputContainer):
container = input_video
else:
Expand Down Expand Up @@ -587,7 +590,7 @@ def extract_audio_from_video(
all audio streams will be extracted. Default: None.
"""
if isinstance(input_video, str):
input_container = av.open(input_video)
input_container = load_video(input_video)
elif isinstance(input_video, av.container.InputContainer):
input_container = input_video
else:
Expand Down Expand Up @@ -627,7 +630,7 @@ def extract_audio_from_video(
if output_audio:
# if the output_audio is not None, prepare the output audio file
this_output_audio = add_suffix_to_filename(output_audio, f'_{idx}')
output_container = av.open(this_output_audio, 'w')
output_container = load_video(this_output_audio, 'w')
output_stream = output_container.add_stream('mp3')

# get the start/end pts
Expand Down
2 changes: 1 addition & 1 deletion environments/minimal_requires.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
fsspec==2023.5.0
pyarrow<=12.0.0
pandas==2.0.0
pandas==2.0.3
datasets==2.11.0
av
soundfile
Expand Down

0 comments on commit eaf7746

Please sign in to comment.