diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index f75c4bef3..8865d12e9 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -420,11 +420,13 @@ def init_setup_from_cfg(cfg): 'text_key': text_key, 'image_key': cfg.image_key, 'audio_key': cfg.audio_key, + 'video_key': cfg.video_key, } elif args['text_key'] is None: args['text_key'] = text_key args['image_key'] = cfg.image_key args['audio_key'] = cfg.audio_key + args['video_key'] = cfg.video_key op[op_name] = args return cfg diff --git a/data_juicer/format/formatter.py b/data_juicer/format/formatter.py index 987a3667b..d17b34074 100644 --- a/data_juicer/format/formatter.py +++ b/data_juicer/format/formatter.py @@ -228,7 +228,7 @@ def non_empty_text(sample, target_keys): if video_key in dataset.features: data_path_keys.append(video_key) if len(data_path_keys) == 0: - # no image/audios path list in dataset, no need to convert + # no image/audio/video path lists in dataset, no need to convert return dataset if ds_dir == '': diff --git a/data_juicer/ops/deduplicator/video_deduplicator.py b/data_juicer/ops/deduplicator/video_deduplicator.py index d06de9f69..205e5a8d3 100644 --- a/data_juicer/ops/deduplicator/video_deduplicator.py +++ b/data_juicer/ops/deduplicator/video_deduplicator.py @@ -49,7 +49,7 @@ def compute_hash(self, sample, context=False): # consider the multi stream of video in one container for packet in videos[key].demux(): if packet.stream.type == 'video': - md5_hash.update(packet.to_bytes()) + md5_hash.update(bytes(packet)) sample[HashKeys.videohash] = md5_hash.hexdigest() return sample