Skip to content

Commit

Permalink
+ support column-wise analysis for DataFrame column of list type
Browse files Browse the repository at this point in the history
  • Loading branch information
HYLcool committed Nov 29, 2023
1 parent 9c31b84 commit eb02bfe
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions data_juicer/analysis/column_wise_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from data_juicer.utils.constant import Fields

Expand Down Expand Up @@ -111,8 +112,10 @@ def analyse(self, show_percentiles=False, show=False):
fig = plt.figure(figsize=(rec_width, rec_height),
layout='constrained')
subfigs = fig.subfigures(rec_row, rec_col, wspace=0.01)
for i, column_name in enumerate(columns):
for i, column_name in tqdm(enumerate(columns), desc='Column'):
data = self.stats[column_name]
# explode data to flatten inner list
data = data.explode().infer_objects()
grid = grid_indexes[i]
if self.save_stats_in_one_file:
if rec_col == 1:
Expand All @@ -128,7 +131,8 @@ def analyse(self, show_percentiles=False, show=False):

# Tell numeric stats from string stats by whether 'top' is NaN, then
# apply a different plot method to each.
if pd.isna(self.overall_result[column_name].get('top')):
# numeric -- draw histogram and box plot for this stat
# numeric or numeric list -- draw histogram and box plot for
# this stat
percentiles = self.overall_result[column_name] \
if show_percentiles else None

Expand All @@ -152,7 +156,8 @@ def analyse(self, show_percentiles=False, show=False):
f'{column_name}-box.png'),
percentiles=percentiles)
else:
# object (string) -- only draw histogram for this stat
# object (string) or string list -- only draw histogram for
# this stat
if self.save_stats_in_one_file:
axes = subfig.subplots(1, 1)
else:
Expand Down

0 comments on commit eb02bfe

Please sign in to comment.