Skip to content

Commit

Permalink
debug video_key and to_bytes
Browse files: browse the repository at this point in the history
  • Loading branch information
BeachWang committed Mar 11, 2024
1 parent da6440a commit f6bf2be
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 2 deletions.
2 changes: 2 additions & 0 deletions data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,11 +420,13 @@ def init_setup_from_cfg(cfg):
'text_key': text_key,
'image_key': cfg.image_key,
'audio_key': cfg.audio_key,
+ 'video_key': cfg.video_key,
}
elif args['text_key'] is None:
args['text_key'] = text_key
args['image_key'] = cfg.image_key
args['audio_key'] = cfg.audio_key
+ args['video_key'] = cfg.video_key
op[op_name] = args

return cfg
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/format/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def non_empty_text(sample, target_keys):
if video_key in dataset.features:
data_path_keys.append(video_key)
if len(data_path_keys) == 0:
- # no image/audios path list in dataset, no need to convert
+ # no image/audio/videos path list in dataset, no need to convert
return dataset

if ds_dir == '':
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/video_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def compute_hash(self, sample, context=False):
# consider the multi stream of video in one container
for packet in videos[key].demux():
if packet.stream.type == 'video':
- md5_hash.update(packet.to_bytes())
+ md5_hash.update(bytes(packet))

sample[HashKeys.videohash] = md5_hash.hexdigest()
return sample
Expand Down

0 comments on commit f6bf2be

Please sign in to comment.