From 1c5f08a60a313fb9a43374b41a53e5ddd0aad302 Mon Sep 17 00:00:00 2001
From: zhijianma
Date: Fri, 20 Oct 2023 17:36:28 +0800
Subject: [PATCH] Chore: specify pandas version to 2.0.0 (#38)

* Chore: specify pandas version to 2.0.0
* bugfix: fix character repetition method
* tests: modify rep_len of character_repetition_filter test
* bugfix: check analyzer output directory
---
 app.py                                         | 32 ++++++++++++-------
 .../ops/filter/character_repetition_filter.py  |  4 +--
 demos/data_process_loop/app.py                 | 30 ++++++++++-------
 demos/data_visualization_op_effect/app.py      | 17 ++++++----
 demos/data_visualization_statistics/app.py     | 14 +++++---
 demos/overview_scan/app.py                     | 14 +++++---
 environments/minimal_requires.txt              |  2 +-
 .../test_character_repetition_filter.py        |  2 +-
 8 files changed, 73 insertions(+), 42 deletions(-)

diff --git a/app.py b/app.py
index 0b5c66ac0..830edf340 100644
--- a/app.py
+++ b/app.py
@@ -137,11 +137,15 @@ def analyze_and_show_res():
         analyzer = Analyser(cfg)
         dataset = analyzer.run()
 
-        analysis_res_ori = pd.read_csv(
-            os.path.join(analyzer.analysis_path, 'overall.csv'))
-        for f_path in os.listdir(analyzer.analysis_path):
-            if '.png' in f_path and 'all-stats' in f_path:
-                images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+        overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+        analysis_res_ori = pd.DataFrame()
+        if os.path.exists(overall_file):
+            analysis_res_ori = pd.read_csv(overall_file)
+
+        if os.path.exists(analyzer.analysis_path):
+            for f_path in os.listdir(analyzer.analysis_path):
+                if '.png' in f_path and 'all-stats' in f_path:
+                    images_ori.append(os.path.join(analyzer.analysis_path, f_path))
 
         st.session_state.dataset = dataset
         st.session_state.original_overall = analysis_res_ori
@@ -171,12 +175,16 @@ def process_and_show_res():
             analyzer.analysis_path = os.path.dirname(
                 cfg_for_processed_data.export_path) + '/analysis'
             analyzer.run()
-            analysis_res_processed = pd.read_csv(
-                os.path.join(analyzer.analysis_path, 'overall.csv'))
-            for f_path in os.listdir(analyzer.analysis_path):
-                if '.png' in f_path and 'all-stats' in f_path:
-                    images_processed.append(
-                        os.path.join(analyzer.analysis_path, f_path))
+
+            overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+            if os.path.exists(overall_file):
+                analysis_res_processed = pd.read_csv(overall_file)
+
+            if os.path.exists(analyzer.analysis_path):
+                for f_path in os.listdir(analyzer.analysis_path):
+                    if '.png' in f_path and 'all-stats' in f_path:
+                        images_processed.append(
+                            os.path.join(analyzer.analysis_path, f_path))
     except Exception as e:
         st.warning(f'Something error with {str(e)}')
 
@@ -221,6 +229,8 @@ class Visualize:
 
     @staticmethod
     def filter_dataset(dataset):
+        if Fields.stats not in dataset:
+            return
         text_key = st.session_state.get('text_key', 'text')
         text = dataset[text_key]
         stats = pd.DataFrame(dataset[Fields.stats])
diff --git a/data_juicer/ops/filter/character_repetition_filter.py b/data_juicer/ops/filter/character_repetition_filter.py
index 2e463c471..f6b65e35c 100644
--- a/data_juicer/ops/filter/character_repetition_filter.py
+++ b/data_juicer/ops/filter/character_repetition_filter.py
@@ -59,10 +59,10 @@ def compute_stats(self, sample):
 
         freq_char_ngrams = sorted(list(freq_char_ngrams.values()),
                                   reverse=True)
-        rep_more_than_one = len([el for el in freq_char_ngrams if el > 1])
+        num_no_rep_char_ngrams = len([el for el in freq_char_ngrams if el == 1])
         num_rep_char_ngrams = min(
             int(np.sqrt(len(freq_char_ngrams))),
-            len(freq_char_ngrams) - rep_more_than_one,
+            len(freq_char_ngrams) - num_no_rep_char_ngrams,
         )
         sample[Fields.stats][StatsKeys.char_rep_ratio] = (sum(
             freq_char_ngrams[:num_rep_char_ngrams]) / sum(freq_char_ngrams)) \
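
The hunk above changes which n-grams bound the repetition window: the number of
most-frequent n-grams summed in the numerator is now capped by the count of
n-grams that occur more than once (total distinct n-grams minus those occurring
exactly once), instead of being shrunk by the count of repeated ones. The
standalone sketch below mirrors the corrected arithmetic on toy inputs; the
helper name and example strings are illustrative only and are not part of this
patch.

    import numpy as np

    def char_rep_ratio(text, rep_len):
        # Count character n-grams of length rep_len, then return the share of
        # all n-gram occurrences owned by the top-k most frequent n-grams,
        # where k is capped by how many n-grams actually repeat.
        ngrams = [text[i:i + rep_len] for i in range(len(text) - rep_len + 1)]
        freq = {}
        for ng in ngrams:
            freq[ng] = freq.get(ng, 0) + 1
        counts = sorted(freq.values(), reverse=True)
        if not counts:
            return 0.0
        num_no_rep = len([c for c in counts if c == 1])
        num_rep = min(int(np.sqrt(len(counts))), len(counts) - num_no_rep)
        return sum(counts[:num_rep]) / sum(counts)

    print(char_rep_ratio('ababababab', rep_len=2))  # 5/9 ~= 0.56, repetitive
    print(char_rep_ratio('abcdefghij', rep_len=2))  # 0.0, no repeated 2-grams

With the previous subtraction, the second call would have reported roughly 0.33
even though no 2-gram repeats; with the fix it is 0.0, while the genuinely
repetitive first string keeps a high ratio.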
diff --git a/demos/data_process_loop/app.py b/demos/data_process_loop/app.py
index a1ae3d01e..901efd9b9 100644
--- a/demos/data_process_loop/app.py
+++ b/demos/data_process_loop/app.py
@@ -96,11 +96,15 @@ def analyze_and_show_res():
         analyzer = Analyser(cfg)
         analyzed_dataset = analyzer.run()
 
-        analysis_res_ori = pd.read_csv(
-            os.path.join(analyzer.analysis_path, 'overall.csv'))
-        for f_path in os.listdir(analyzer.analysis_path):
-            if '.png' in f_path and 'all-stats' in f_path:
-                images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+        overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+        analysis_res_ori = pd.DataFrame()
+        if os.path.exists(overall_file):
+            analysis_res_ori = pd.read_csv(overall_file)
+
+        if os.path.exists(analyzer.analysis_path):
+            for f_path in os.listdir(analyzer.analysis_path):
+                if '.png' in f_path and 'all-stats' in f_path:
+                    images_ori.append(os.path.join(analyzer.analysis_path, f_path))
 
         st.session_state.analyzed_dataset = analyzed_dataset
         st.session_state.original_overall = analysis_res_ori
@@ -132,12 +136,16 @@ def process_and_show_res():
             analyzer.analysis_path = os.path.dirname(
                 cfg_for_processed_data.export_path) + '/analysis'
             analyzer.run()
-            analysis_res_processed = pd.read_csv(
-                os.path.join(analyzer.analysis_path, 'overall.csv'))
-            for f_path in os.listdir(analyzer.analysis_path):
-                if '.png' in f_path and 'all-stats' in f_path:
-                    images_processed.append(
-                        os.path.join(analyzer.analysis_path, f_path))
+
+            overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+            if os.path.exists(overall_file):
+                analysis_res_processed = pd.read_csv(overall_file)
+
+            if os.path.exists(analyzer.analysis_path):
+                for f_path in os.listdir(analyzer.analysis_path):
+                    if '.png' in f_path and 'all-stats' in f_path:
+                        images_processed.append(
+                            os.path.join(analyzer.analysis_path, f_path))
         else:
             st.warning('No sample left after processing. Please change \
                 anther dataset or op parameters then rerun')
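
The guarded loading added here is the same pattern applied to app.py above and
to each demo below: read overall.csv only if the analyzer produced it, and scan
the analysis directory only if it exists. Purely as an illustration of the
shared logic (the helper name is hypothetical; the patch keeps these checks
inlined in every file rather than sharing a helper), it amounts to:

    import os
    import pandas as pd

    def load_analysis_results(analysis_path):
        # Return (overall stats DataFrame, list of 'all-stats' image paths),
        # tolerating a missing or empty analysis directory.
        overall_file = os.path.join(analysis_path, 'overall.csv')
        overall = pd.read_csv(overall_file) if os.path.exists(overall_file) \
            else pd.DataFrame()
        images = []
        if os.path.exists(analysis_path):
            images = [
                os.path.join(analysis_path, f) for f in os.listdir(analysis_path)
                if '.png' in f and 'all-stats' in f
            ]
        return overall, images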
Please change \ anther dataset or op parameters then rerun') diff --git a/demos/data_visualization_op_effect/app.py b/demos/data_visualization_op_effect/app.py index 0e6f60fc8..c5a944cf5 100644 --- a/demos/data_visualization_op_effect/app.py +++ b/demos/data_visualization_op_effect/app.py @@ -108,11 +108,15 @@ def analyze_and_show_res(dataset_file): analyzer = Analyser(cfg) dataset = analyzer.run() - analysis_res_ori = pd.read_csv( - os.path.join(analyzer.analysis_path, 'overall.csv')) - for f_path in os.listdir(analyzer.analysis_path): - if '.png' in f_path and 'all-stats' in f_path: - images_ori.append(os.path.join(analyzer.analysis_path, f_path)) + overall_file = os.path.join(analyzer.analysis_path, 'overall.csv') + analysis_res_ori = pd.DataFrame() + if os.path.exists(overall_file): + analysis_res_ori = pd.read_csv(overall_file) + + if os.path.exists(analyzer.analysis_path): + for f_path in os.listdir(analyzer.analysis_path): + if '.png' in f_path and 'all-stats' in f_path: + images_ori.append(os.path.join(analyzer.analysis_path, f_path)) st.session_state.dataset = dataset st.session_state.original_overall = analysis_res_ori @@ -316,7 +320,8 @@ def op_effect_analyze(): @staticmethod def filter_dataset(dataset): - + if Fields.stats not in dataset: + return text_key = st.session_state.get('text_key', 'text') text = dataset[text_key] stats = pd.DataFrame(dataset[Fields.stats]) diff --git a/demos/data_visualization_statistics/app.py b/demos/data_visualization_statistics/app.py index 2c9e9c966..e4f14f1bc 100644 --- a/demos/data_visualization_statistics/app.py +++ b/demos/data_visualization_statistics/app.py @@ -93,11 +93,15 @@ def analyze_and_show_res(dataset_file): analyzer = Analyser(cfg) dataset = analyzer.run() - analysis_res_ori = pd.read_csv( - os.path.join(analyzer.analysis_path, 'overall.csv')) - for f_path in os.listdir(analyzer.analysis_path): - if '.png' in f_path and 'all-stats' in f_path: - images_ori.append(os.path.join(analyzer.analysis_path, f_path)) + overall_file = os.path.join(analyzer.analysis_path, 'overall.csv') + analysis_res_ori = pd.DataFrame() + if os.path.exists(overall_file): + analysis_res_ori = pd.read_csv(overall_file) + + if os.path.exists(analyzer.analysis_path): + for f_path in os.listdir(analyzer.analysis_path): + if '.png' in f_path and 'all-stats' in f_path: + images_ori.append(os.path.join(analyzer.analysis_path, f_path)) st.session_state.dataset = dataset st.session_state.original_overall = analysis_res_ori diff --git a/demos/overview_scan/app.py b/demos/overview_scan/app.py index 064e65240..d3fbbc000 100644 --- a/demos/overview_scan/app.py +++ b/demos/overview_scan/app.py @@ -204,11 +204,15 @@ def run_demo(): with st.spinner('Wait for analyze...'): analyzer.run() - analysis_res_ori = pd.read_csv( - os.path.join(analyzer.analysis_path, 'overall.csv')) - for f_path in os.listdir(analyzer.analysis_path): - if '.png' in f_path and 'all-stats' in f_path: - images_ori.append(os.path.join(analyzer.analysis_path, f_path)) + overall_file = os.path.join(analyzer.analysis_path, 'overall.csv') + analysis_res_ori = pd.DataFrame() + if os.path.exists(overall_file): + analysis_res_ori = pd.read_csv(overall_file) + + if os.path.exists(analyzer.analysis_path): + for f_path in os.listdir(analyzer.analysis_path): + if '.png' in f_path and 'all-stats' in f_path: + images_ori.append(os.path.join(analyzer.analysis_path, f_path)) st.subheader('Statistics') st.dataframe(analysis_res_ori, use_container_width=True) diff --git a/environments/minimal_requires.txt 
diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt
index fa1dab89e..723494975 100644
--- a/environments/minimal_requires.txt
+++ b/environments/minimal_requires.txt
@@ -1,9 +1,9 @@
+pandas==2.0.0
 datasets==2.11.0
 loguru
 tqdm
 jsonargparse[signatures]
 matplotlib
-pandas
 requests
 wget
 zstandard
diff --git a/tests/ops/filter/test_character_repetition_filter.py b/tests/ops/filter/test_character_repetition_filter.py
index e05f47f59..1d5683c82 100644
--- a/tests/ops/filter/test_character_repetition_filter.py
+++ b/tests/ops/filter/test_character_repetition_filter.py
@@ -41,7 +41,7 @@ def test_case(self):
             'text': '中文也是一个字算一个长度'
         }]
         dataset = Dataset.from_list(ds_list)
-        op = CharacterRepetitionFilter(rep_len=10,
+        op = CharacterRepetitionFilter(rep_len=5,
                                        min_ratio=0.0,
                                        max_ratio=0.4)
         self._run_character_repetition_filter(dataset, tgt_list, op)
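
For context, a minimal way to exercise the updated filter outside the test
harness might look like the sketch below. It assumes the usual data-juicer
filter interface, i.e. that compute_stats returns the sample with its stats
field filled in and that process returns a keep/drop boolean, and it assumes
Fields and StatsKeys are importable from data_juicer.utils.constant; the sample
text is made up.

    from data_juicer.ops.filter.character_repetition_filter import \
        CharacterRepetitionFilter
    from data_juicer.utils.constant import Fields, StatsKeys  # assumed path

    # Same parameters as the updated test case above.
    op = CharacterRepetitionFilter(rep_len=5, min_ratio=0.0, max_ratio=0.4)
    sample = {'text': 'Today is Sunday Sunday Sunday and it is sunny today.',
              Fields.stats: {}}
    sample = op.compute_stats(sample)
    print(sample[Fields.stats][StatsKeys.char_rep_ratio])
    print(op.process(sample))  # True only if the ratio lies in [0.0, 0.4]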