Skip to content

Commit

Permalink
* avoid unaligned columns when converting and processing the dataset due to missing column or unaligned data type
Browse files Browse the repository at this point in the history
  • Loading branch information
HYLcool committed Nov 6, 2023
1 parent 5fc1f62 commit 473b088
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 2 deletions.
2 changes: 2 additions & 0 deletions data_juicer/ops/filter/image_aspect_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def compute_stats(self, sample, context=False):

# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
+ sample[Fields.stats][StatsKeys.aspect_ratios] = np.array(
+ [], dtype=np.float64)
return sample

# load images
Expand Down
3 changes: 1 addition & 2 deletions tools/multimodal/ds2dj/llava2dj.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,9 +265,8 @@ def main(
new_sample = {
'id': id,
text_key: text,
+ image_key: images,
}
- if images:
- new_sample[image_key] = images
writer.write(new_sample)
logger.info(f'Store the target dataset into [{target_ds_path}].')

Expand Down

0 comments on commit 473b088

Please sign in to comment.