Fix/several minor bugs (#303)

* * throw explicit errors when video path doesn't exist
* * Suppress the debug info when initializing data-juicer
* add debug mode
* + add mode to load_video func
* + add mode to load_video func
* * update pandas to 2.0.3
* * change ValueError to FileNotFoundError
+ add detailed comments
modelscope · Apr 24, 2024 · eaf7746 · eaf7746
1 parent 517efe1
commit eaf7746
Show file tree

Hide file tree

Showing 5 changed files with 34 additions and 15 deletions.
diff --git a/data_juicer/__init__.py b/data_juicer/__init__.py
@@ -2,12 +2,18 @@
 
 import os
 import subprocess
+import sys
 
 import multiprocess as mp
 from loguru import logger
 
 from data_juicer.utils.availability_utils import _is_package_available
 
+# For now, only INFO will be shown. Later the severity level will be changed
+# when setup_logger is called to initialize the logger.
+logger.remove()
+logger.add(sys.stderr, level='INFO')
+
 
 def _cuda_device_count():
     _torch_available = _is_package_available('torch')

diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py
@@ -310,6 +310,10 @@ def init_configs(args=None):
                         default='auto',
                         help='The address of the Ray cluster.')
 
+    parser.add_argument('--debug',
+                        action='store_true',
+                        help='Whether to run in debug mode.')
+
     # add all parameters of the registered ops class to the parser,
     # and these op parameters can be modified through the command line,
     ops_sorted_by_types = sort_op_by_types_and_names(OPERATORS.modules.items())
@@ -330,6 +334,9 @@ def init_configs(args=None):
         global_cfg = cfg
         global_parser = parser
 
+        if cfg.debug:
+            logger.debug('In DEBUG mode.')
+
         return cfg
     except ArgumentError:
         logger.error('Config initialization failed')
@@ -378,6 +385,7 @@ def init_setup_from_cfg(cfg):
     logfile_name = f'export_{export_rel_path}_time_{timestamp}.txt'
     setup_logger(save_dir=log_dir,
                  filename=logfile_name,
+                 level='DEBUG' if cfg.debug else 'INFO',
                  redirect=cfg.executor_type == 'default')
 
     # check and get dataset dir

diff --git a/data_juicer/utils/logger_utils.py b/data_juicer/utils/logger_utils.py
@@ -96,6 +96,7 @@ def setup_logger(save_dir,
                  distributed_rank=0,
                  filename='log.txt',
                  mode='o',
+                 level='INFO',
                  redirect=True):
     """
     Setup logger for training and testing.
@@ -104,6 +105,7 @@ def setup_logger(save_dir,
     :param distributed_rank: device rank when multi-gpu environment
     :param filename: log file name to save
     :param mode: log file write mode, `append` or `override`. default is `o`.
+    :param level: log severity level. It's "INFO" in default.
     :param redirect: whether to redirect system output
     :return: logger instance.
     """
@@ -127,14 +129,14 @@ def setup_logger(save_dir,
         logger.add(
             sys.stderr,
             format=loguru_format,
-            level='INFO',
+            level=level,
             enqueue=True,
         )
         logger.add(save_file)
 
     # redirect stdout/stderr to loguru
     if redirect:
-        redirect_sys_output('INFO')
+        redirect_sys_output(level)
     LOGGER_SETUP = True
 
 

diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py
@@ -162,16 +162,19 @@ def load_videos(paths):
     return [load_video(path) for path in paths]
 
 
-def load_video(path):
+def load_video(path, mode='r'):
     """
     Load a video using its path.
 
     :param path: the path to this video.
+    :param mode: the loading mode. It's "r" in default.
     :return: a container object form PyAv library, which contains all streams
         in this video (video/audio/...) and can be used to decode these streams
         to frames.
     """
-    container = av.open(path)
+    if not os.path.exists(path) and 'r' in mode:
+        raise FileNotFoundError(f'Video [{path}] does not exist!')
+    container = av.open(path, mode)
     return container
 
 
@@ -188,7 +191,7 @@ def get_video_duration(input_video: Union[str, av.container.InputContainer],
     :return: duration of the video in second
     """
     if isinstance(input_video, str):
-        container = av.open(input_video)
+        container = load_video(input_video)
     elif isinstance(input_video, av.container.InputContainer):
         container = input_video
     else:
@@ -215,7 +218,7 @@ def get_decoded_frames_from_video(
     :return: an iterator of all the frames of the video
     """
     if isinstance(input_video, str):
-        container = av.open(input_video)
+        container = load_video(input_video)
     elif isinstance(input_video, av.container.InputContainer):
         container = input_video
     stream = container.streams.video[video_stream_index]
@@ -243,12 +246,12 @@ def cut_video_by_seconds(
     """
     # open the original video
     if isinstance(input_video, str):
-        container = av.open(input_video)
+        container = load_video(input_video)
     else:
         container = input_video
 
     # create the output video
-    output_container = av.open(output_video, 'w')
+    output_container = load_video(output_video, 'w')
 
     # add the video stream into the output video according to input video
     input_video_stream = container.streams.video[0]
@@ -340,12 +343,12 @@ def process_each_frame(input_video: Union[str, av.container.InputContainer],
 
     # open the original video
     if isinstance(input_video, str):
-        container = av.open(input_video)
+        container = load_video(input_video)
     else:
         container = input_video
 
     # create the output video
-    output_container = av.open(output_video, 'w')
+    output_container = load_video(output_video, 'w')
 
     # add the audio stream into the output video with template of input audio
     for input_audio_stream in container.streams.audio:
@@ -402,7 +405,7 @@ def extract_key_frames(input_video: Union[str, av.container.InputContainer]):
     """
     # load the input video
     if isinstance(input_video, str):
-        container = av.open(input_video)
+        container = load_video(input_video)
     elif isinstance(input_video, av.container.InputContainer):
         container = input_video
     else:
@@ -462,7 +465,7 @@ def extract_video_frames_uniformly(
     """
     # load the input video
     if isinstance(input_video, str):
-        container = av.open(input_video)
+        container = load_video(input_video)
     elif isinstance(input_video, av.container.InputContainer):
         container = input_video
     else:
@@ -587,7 +590,7 @@ def extract_audio_from_video(
         all audio streams will be extracted. Default: None.
     """
     if isinstance(input_video, str):
-        input_container = av.open(input_video)
+        input_container = load_video(input_video)
     elif isinstance(input_video, av.container.InputContainer):
         input_container = input_video
     else:
@@ -627,7 +630,7 @@ def extract_audio_from_video(
         if output_audio:
             # if the output_audio is not None, prepare the output audio file
             this_output_audio = add_suffix_to_filename(output_audio, f'_{idx}')
-            output_container = av.open(this_output_audio, 'w')
+            output_container = load_video(this_output_audio, 'w')
             output_stream = output_container.add_stream('mp3')
 
         # get the start/end pts

diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt
@@ -1,6 +1,6 @@
 fsspec==2023.5.0
 pyarrow<=12.0.0
-pandas==2.0.0
+pandas==2.0.3
 datasets==2.11.0
 av
 soundfile