Skip to content

Commit

Permalink
Chore: specify pandas version to 2.0.0 (#38)
Browse files Browse the repository at this point in the history
* Chore: specify pandas version to 2.0.0

* bugfix: fix character repetition method

* tests: modify rep_len of character_repetition_filter test

* bugfix: check analyzer output directory
  • Loading branch information
zhijianma authored Oct 20, 2023
1 parent 95b292b commit 1c5f08a
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 42 deletions.
32 changes: 21 additions & 11 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,15 @@ def analyze_and_show_res():
analyzer = Analyser(cfg)
dataset = analyzer.run()

- analysis_res_ori = pd.read_csv(
- os.path.join(analyzer.analysis_path, 'overall.csv'))
- for f_path in os.listdir(analyzer.analysis_path):
- if '.png' in f_path and 'all-stats' in f_path:
- images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+ overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+ analysis_res_ori = pd.DataFrame()
+ if os.path.exists(overall_file):
+ analysis_res_ori = pd.read_csv(overall_file)
+
+ if os.path.exists(analyzer.analysis_path):
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))

st.session_state.dataset = dataset
st.session_state.original_overall = analysis_res_ori
Expand Down Expand Up @@ -171,12 +175,16 @@ def process_and_show_res():
analyzer.analysis_path = os.path.dirname(
cfg_for_processed_data.export_path) + '/analysis'
analyzer.run()
- analysis_res_processed = pd.read_csv(
- os.path.join(analyzer.analysis_path, 'overall.csv'))
- for f_path in os.listdir(analyzer.analysis_path):
- if '.png' in f_path and 'all-stats' in f_path:
- images_processed.append(
- os.path.join(analyzer.analysis_path, f_path))
+
+ overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+ if os.path.exists(overall_file):
+ analysis_res_processed = pd.read_csv(overall_file)
+
+ if os.path.exists(analyzer.analysis_path):
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_processed.append(
+ os.path.join(analyzer.analysis_path, f_path))
except Exception as e:
st.warning(f'Something error with {str(e)}')

Expand Down Expand Up @@ -221,6 +229,8 @@ class Visualize:

@staticmethod
def filter_dataset(dataset):
+ if Fields.stats not in dataset:
+ return
text_key = st.session_state.get('text_key', 'text')
text = dataset[text_key]
stats = pd.DataFrame(dataset[Fields.stats])
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/character_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ def compute_stats(self, sample):

freq_char_ngrams = sorted(list(freq_char_ngrams.values()),
reverse=True)
- rep_more_than_one = len([el for el in freq_char_ngrams if el > 1])
+ num_no_rep_char_ngrams = len([el for el in freq_char_ngrams if el == 1])
num_rep_char_ngrams = min(
int(np.sqrt(len(freq_char_ngrams))),
- len(freq_char_ngrams) - rep_more_than_one,
+ len(freq_char_ngrams) - num_no_rep_char_ngrams,
)
sample[Fields.stats][StatsKeys.char_rep_ratio] = (sum(
freq_char_ngrams[:num_rep_char_ngrams]) / sum(freq_char_ngrams)) \
Expand Down
30 changes: 19 additions & 11 deletions demos/data_process_loop/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,15 @@ def analyze_and_show_res():
analyzer = Analyser(cfg)
analyzed_dataset = analyzer.run()

- analysis_res_ori = pd.read_csv(
- os.path.join(analyzer.analysis_path, 'overall.csv'))
- for f_path in os.listdir(analyzer.analysis_path):
- if '.png' in f_path and 'all-stats' in f_path:
- images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+ overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+ analysis_res_ori = pd.DataFrame()
+ if os.path.exists(overall_file):
+ analysis_res_ori = pd.read_csv(overall_file)
+
+ if os.path.exists(analyzer.analysis_path):
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))

st.session_state.analyzed_dataset = analyzed_dataset
st.session_state.original_overall = analysis_res_ori
Expand Down Expand Up @@ -132,12 +136,16 @@ def process_and_show_res():
analyzer.analysis_path = os.path.dirname(
cfg_for_processed_data.export_path) + '/analysis'
analyzer.run()
- analysis_res_processed = pd.read_csv(
- os.path.join(analyzer.analysis_path, 'overall.csv'))
- for f_path in os.listdir(analyzer.analysis_path):
- if '.png' in f_path and 'all-stats' in f_path:
- images_processed.append(
- os.path.join(analyzer.analysis_path, f_path))
+
+ overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+ if os.path.exists(overall_file):
+ analysis_res_processed = pd.read_csv(overall_file)
+
+ if os.path.exists(analyzer.analysis_path):
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_processed.append(
+ os.path.join(analyzer.analysis_path, f_path))
else:
st.warning('No sample left after processing. Please change \
anther dataset or op parameters then rerun')
Expand Down
17 changes: 11 additions & 6 deletions demos/data_visualization_op_effect/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,15 @@ def analyze_and_show_res(dataset_file):
analyzer = Analyser(cfg)
dataset = analyzer.run()

- analysis_res_ori = pd.read_csv(
- os.path.join(analyzer.analysis_path, 'overall.csv'))
- for f_path in os.listdir(analyzer.analysis_path):
- if '.png' in f_path and 'all-stats' in f_path:
- images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+ overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+ analysis_res_ori = pd.DataFrame()
+ if os.path.exists(overall_file):
+ analysis_res_ori = pd.read_csv(overall_file)
+
+ if os.path.exists(analyzer.analysis_path):
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))

st.session_state.dataset = dataset
st.session_state.original_overall = analysis_res_ori
Expand Down Expand Up @@ -316,7 +320,8 @@ def op_effect_analyze():

@staticmethod
def filter_dataset(dataset):

+ if Fields.stats not in dataset:
+ return
text_key = st.session_state.get('text_key', 'text')
text = dataset[text_key]
stats = pd.DataFrame(dataset[Fields.stats])
Expand Down
14 changes: 9 additions & 5 deletions demos/data_visualization_statistics/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,15 @@ def analyze_and_show_res(dataset_file):
analyzer = Analyser(cfg)
dataset = analyzer.run()

- analysis_res_ori = pd.read_csv(
- os.path.join(analyzer.analysis_path, 'overall.csv'))
- for f_path in os.listdir(analyzer.analysis_path):
- if '.png' in f_path and 'all-stats' in f_path:
- images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+ overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+ analysis_res_ori = pd.DataFrame()
+ if os.path.exists(overall_file):
+ analysis_res_ori = pd.read_csv(overall_file)
+
+ if os.path.exists(analyzer.analysis_path):
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))

st.session_state.dataset = dataset
st.session_state.original_overall = analysis_res_ori
Expand Down
14 changes: 9 additions & 5 deletions demos/overview_scan/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,15 @@ def run_demo():
with st.spinner('Wait for analyze...'):
analyzer.run()

- analysis_res_ori = pd.read_csv(
- os.path.join(analyzer.analysis_path, 'overall.csv'))
- for f_path in os.listdir(analyzer.analysis_path):
- if '.png' in f_path and 'all-stats' in f_path:
- images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+ overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+ analysis_res_ori = pd.DataFrame()
+ if os.path.exists(overall_file):
+ analysis_res_ori = pd.read_csv(overall_file)
+
+ if os.path.exists(analyzer.analysis_path):
+ for f_path in os.listdir(analyzer.analysis_path):
+ if '.png' in f_path and 'all-stats' in f_path:
+ images_ori.append(os.path.join(analyzer.analysis_path, f_path))

st.subheader('Statistics')
st.dataframe(analysis_res_ori, use_container_width=True)
Expand Down
2 changes: 1 addition & 1 deletion environments/minimal_requires.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
+ pandas==2.0.0
datasets==2.11.0
loguru
tqdm
jsonargparse[signatures]
matplotlib
- pandas
requests
wget
zstandard
Expand Down
2 changes: 1 addition & 1 deletion tests/ops/filter/test_character_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_case(self):
'text': '中文也是一个字算一个长度'
}]
dataset = Dataset.from_list(ds_list)
- op = CharacterRepetitionFilter(rep_len=10,
+ op = CharacterRepetitionFilter(rep_len=5,
min_ratio=0.0,
max_ratio=0.4)
self._run_character_repetition_filter(dataset, tgt_list, op)
Expand Down

0 comments on commit 1c5f08a

Please sign in to comment.