From 1c5f08a60a313fb9a43374b41a53e5ddd0aad302 Mon Sep 17 00:00:00 2001
From: zhijianma
Date: Fri, 20 Oct 2023 17:36:28 +0800
Subject: [PATCH] Chore: specify pandas version to 2.0.0 (#38)

* Chore: specify pandas version to 2.0.0
* bugfix: fix character repetition method
* tests: modify rep_len of character_repetition_filter test
* bugfix: check analyzer output directory
---
 app.py                                         | 32 ++++++++++++-------
 .../ops/filter/character_repetition_filter.py  |  4 +--
 demos/data_process_loop/app.py                 | 30 ++++++++++-------
 demos/data_visualization_op_effect/app.py      | 17 ++++++----
 demos/data_visualization_statistics/app.py     | 14 +++++---
 demos/overview_scan/app.py                     | 14 +++++---
 environments/minimal_requires.txt              |  2 +-
 .../test_character_repetition_filter.py        |  2 +-
 8 files changed, 73 insertions(+), 42 deletions(-)

diff --git a/app.py b/app.py
index 0b5c66ac0..830edf340 100644
--- a/app.py
+++ b/app.py
@@ -137,11 +137,15 @@ def analyze_and_show_res():
         analyzer = Analyser(cfg)
         dataset = analyzer.run()
 
-        analysis_res_ori = pd.read_csv(
-            os.path.join(analyzer.analysis_path, 'overall.csv'))
-        for f_path in os.listdir(analyzer.analysis_path):
-            if '.png' in f_path and 'all-stats' in f_path:
-                images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+        overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+        analysis_res_ori = pd.DataFrame()
+        if os.path.exists(overall_file):
+            analysis_res_ori = pd.read_csv(overall_file)
+
+        if os.path.exists(analyzer.analysis_path):
+            for f_path in os.listdir(analyzer.analysis_path):
+                if '.png' in f_path and 'all-stats' in f_path:
+                    images_ori.append(os.path.join(analyzer.analysis_path, f_path))
 
         st.session_state.dataset = dataset
         st.session_state.original_overall = analysis_res_ori
@@ -171,12 +175,16 @@ def process_and_show_res():
             analyzer.analysis_path = os.path.dirname(
                 cfg_for_processed_data.export_path) + '/analysis'
             analyzer.run()
-            analysis_res_processed = pd.read_csv(
-                os.path.join(analyzer.analysis_path, 'overall.csv'))
-            for f_path in os.listdir(analyzer.analysis_path):
-                if '.png' in f_path and 'all-stats' in f_path:
-                    images_processed.append(
-                        os.path.join(analyzer.analysis_path, f_path))
+
+            overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+            if os.path.exists(overall_file):
+                analysis_res_processed = pd.read_csv(overall_file)
+
+            if os.path.exists(analyzer.analysis_path):
+                for f_path in os.listdir(analyzer.analysis_path):
+                    if '.png' in f_path and 'all-stats' in f_path:
+                        images_processed.append(
+                            os.path.join(analyzer.analysis_path, f_path))
     except Exception as e:
         st.warning(f'Something error with {str(e)}')
 
@@ -221,6 +229,8 @@ class Visualize:
 
     @staticmethod
     def filter_dataset(dataset):
+        if Fields.stats not in dataset:
+            return
         text_key = st.session_state.get('text_key', 'text')
         text = dataset[text_key]
         stats = pd.DataFrame(dataset[Fields.stats])
diff --git a/data_juicer/ops/filter/character_repetition_filter.py b/data_juicer/ops/filter/character_repetition_filter.py
index 2e463c471..f6b65e35c 100644
--- a/data_juicer/ops/filter/character_repetition_filter.py
+++ b/data_juicer/ops/filter/character_repetition_filter.py
@@ -59,10 +59,10 @@ def compute_stats(self, sample):
 
         freq_char_ngrams = sorted(list(freq_char_ngrams.values()),
                                   reverse=True)
-        rep_more_than_one = len([el for el in freq_char_ngrams if el > 1])
+        num_no_rep_char_ngrams = len([el for el in freq_char_ngrams if el == 1])
         num_rep_char_ngrams = min(
             int(np.sqrt(len(freq_char_ngrams))),
-            len(freq_char_ngrams) - rep_more_than_one,
+            len(freq_char_ngrams) - num_no_rep_char_ngrams,
         )
         sample[Fields.stats][StatsKeys.char_rep_ratio] = (sum(
             freq_char_ngrams[:num_rep_char_ngrams]) / sum(freq_char_ngrams)) \
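
The hunk above changes which n-grams bound the repetition window: the number of
most-frequent n-grams summed in the numerator is now capped by the count of
n-grams that occur more than once (total distinct n-grams minus those occurring
exactly once), instead of being shrunk by the count of repeated ones. The
standalone sketch below mirrors the corrected arithmetic on toy inputs; the
helper name and example strings are illustrative only and are not part of this
patch.

    import numpy as np

    def char_rep_ratio(text, rep_len):
        # Count character n-grams of length rep_len, then return the share of
        # all n-gram occurrences owned by the top-k most frequent n-grams,
        # where k is capped by how many n-grams actually repeat.
        ngrams = [text[i:i + rep_len] for i in range(len(text) - rep_len + 1)]
        freq = {}
        for ng in ngrams:
            freq[ng] = freq.get(ng, 0) + 1
        counts = sorted(freq.values(), reverse=True)
        if not counts:
            return 0.0
        num_no_rep = len([c for c in counts if c == 1])
        num_rep = min(int(np.sqrt(len(counts))), len(counts) - num_no_rep)
        return sum(counts[:num_rep]) / sum(counts)

    print(char_rep_ratio('ababababab', rep_len=2))  # 5/9 ~= 0.56, repetitive
    print(char_rep_ratio('abcdefghij', rep_len=2))  # 0.0, no repeated 2-grams

With the previous subtraction, the second call would have reported roughly 0.33
even though no 2-gram repeats; with the fix it is 0.0, while the genuinely
repetitive first string keeps a high ratio.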
diff --git a/demos/data_process_loop/app.py b/demos/data_process_loop/app.py
index a1ae3d01e..901efd9b9 100644
--- a/demos/data_process_loop/app.py
+++ b/demos/data_process_loop/app.py
@@ -96,11 +96,15 @@ def analyze_and_show_res():
         analyzer = Analyser(cfg)
         analyzed_dataset = analyzer.run()
 
-        analysis_res_ori = pd.read_csv(
-            os.path.join(analyzer.analysis_path, 'overall.csv'))
-        for f_path in os.listdir(analyzer.analysis_path):
-            if '.png' in f_path and 'all-stats' in f_path:
-                images_ori.append(os.path.join(analyzer.analysis_path, f_path))
+        overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+        analysis_res_ori = pd.DataFrame()
+        if os.path.exists(overall_file):
+            analysis_res_ori = pd.read_csv(overall_file)
+
+        if os.path.exists(analyzer.analysis_path):
+            for f_path in os.listdir(analyzer.analysis_path):
+                if '.png' in f_path and 'all-stats' in f_path:
+                    images_ori.append(os.path.join(analyzer.analysis_path, f_path))
 
         st.session_state.analyzed_dataset = analyzed_dataset
         st.session_state.original_overall = analysis_res_ori
@@ -132,12 +136,16 @@ def process_and_show_res():
             analyzer.analysis_path = os.path.dirname(
                 cfg_for_processed_data.export_path) + '/analysis'
             analyzer.run()
-            analysis_res_processed = pd.read_csv(
-                os.path.join(analyzer.analysis_path, 'overall.csv'))
-            for f_path in os.listdir(analyzer.analysis_path):
-                if '.png' in f_path and 'all-stats' in f_path:
-                    images_processed.append(
-                        os.path.join(analyzer.analysis_path, f_path))
+
+            overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
+            if os.path.exists(overall_file):
+                analysis_res_processed = pd.read_csv(overall_file)
+
+            if os.path.exists(analyzer.analysis_path):
+                for f_path in os.listdir(analyzer.analysis_path):
+                    if '.png' in f_path and 'all-stats' in f_path:
+                        images_processed.append(
+                            os.path.join(analyzer.analysis_path, f_path))
         else:
             st.warning('No sample left after processing. Please change \
                 anther dataset or op parameters then rerun')
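
The guarded loading added here is the same pattern applied to app.py above and
to each demo below: read overall.csv only if the analyzer produced it, and scan
the analysis directory only if it exists. Purely as an illustration of the
shared logic (the helper name is hypothetical; the patch keeps these checks
inlined in every file rather than sharing a helper), it amounts to:

    import os
    import pandas as pd

    def load_analysis_results(analysis_path):
        # Return (overall stats DataFrame, list of 'all-stats' image paths),
        # tolerating a missing or empty analysis directory.
        overall_file = os.path.join(analysis_path, 'overall.csv')
        overall = pd.read_csv(overall_file) if os.path.exists(overall_file) \
            else pd.DataFrame()
        images = []
        if os.path.exists(analysis_path):
            images = [
                os.path.join(analysis_path, f) for f in os.listdir(analysis_path)
                if '.png' in f and 'all-stats' in f
            ]
        return overall, images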
Please change \ anther dataset or op parameters then rerun') diff --git a/demos/data_visualization_op_effect/app.py b/demos/data_visualization_op_effect/app.py index 0e6f60fc8..c5a944cf5 100644 --- a/demos/data_visualization_op_effect/app.py +++ b/demos/data_visualization_op_effect/app.py @@ -108,11 +108,15 @@ def analyze_and_show_res(dataset_file): analyzer = Analyser(cfg) dataset = analyzer.run() - analysis_res_ori = pd.read_csv( - os.path.join(analyzer.analysis_path, 'overall.csv')) - for f_path in os.listdir(analyzer.analysis_path): - if '.png' in f_path and 'all-stats' in f_path: - images_ori.append(os.path.join(analyzer.analysis_path, f_path)) + overall_file = os.path.join(analyzer.analysis_path, 'overall.csv') + analysis_res_ori = pd.DataFrame() + if os.path.exists(overall_file): + analysis_res_ori = pd.read_csv(overall_file) + + if os.path.exists(analyzer.analysis_path): + for f_path in os.listdir(analyzer.analysis_path): + if '.png' in f_path and 'all-stats' in f_path: + images_ori.append(os.path.join(analyzer.analysis_path, f_path)) st.session_state.dataset = dataset st.session_state.original_overall = analysis_res_ori @@ -316,7 +320,8 @@ def op_effect_analyze(): @staticmethod def filter_dataset(dataset): - + if Fields.stats not in dataset: + return text_key = st.session_state.get('text_key', 'text') text = dataset[text_key] stats = pd.DataFrame(dataset[Fields.stats]) diff --git a/demos/data_visualization_statistics/app.py b/demos/data_visualization_statistics/app.py index 2c9e9c966..e4f14f1bc 100644 --- a/demos/data_visualization_statistics/app.py +++ b/demos/data_visualization_statistics/app.py @@ -93,11 +93,15 @@ def analyze_and_show_res(dataset_file): analyzer = Analyser(cfg) dataset = analyzer.run() - analysis_res_ori = pd.read_csv( - os.path.join(analyzer.analysis_path, 'overall.csv')) - for f_path in os.listdir(analyzer.analysis_path): - if '.png' in f_path and 'all-stats' in f_path: - images_ori.append(os.path.join(analyzer.analysis_path, f_path)) + overall_file = os.path.join(analyzer.analysis_path, 'overall.csv') + analysis_res_ori = pd.DataFrame() + if os.path.exists(overall_file): + analysis_res_ori = pd.read_csv(overall_file) + + if os.path.exists(analyzer.analysis_path): + for f_path in os.listdir(analyzer.analysis_path): + if '.png' in f_path and 'all-stats' in f_path: + images_ori.append(os.path.join(analyzer.analysis_path, f_path)) st.session_state.dataset = dataset st.session_state.original_overall = analysis_res_ori diff --git a/demos/overview_scan/app.py b/demos/overview_scan/app.py index 064e65240..d3fbbc000 100644 --- a/demos/overview_scan/app.py +++ b/demos/overview_scan/app.py @@ -204,11 +204,15 @@ def run_demo(): with st.spinner('Wait for analyze...'): analyzer.run() - analysis_res_ori = pd.read_csv( - os.path.join(analyzer.analysis_path, 'overall.csv')) - for f_path in os.listdir(analyzer.analysis_path): - if '.png' in f_path and 'all-stats' in f_path: - images_ori.append(os.path.join(analyzer.analysis_path, f_path)) + overall_file = os.path.join(analyzer.analysis_path, 'overall.csv') + analysis_res_ori = pd.DataFrame() + if os.path.exists(overall_file): + analysis_res_ori = pd.read_csv(overall_file) + + if os.path.exists(analyzer.analysis_path): + for f_path in os.listdir(analyzer.analysis_path): + if '.png' in f_path and 'all-stats' in f_path: + images_ori.append(os.path.join(analyzer.analysis_path, f_path)) st.subheader('Statistics') st.dataframe(analysis_res_ori, use_container_width=True) diff --git a/environments/minimal_requires.txt 
diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt
index fa1dab89e..723494975 100644
--- a/environments/minimal_requires.txt
+++ b/environments/minimal_requires.txt
@@ -1,9 +1,9 @@
+pandas==2.0.0
 datasets==2.11.0
 loguru
 tqdm
 jsonargparse[signatures]
 matplotlib
-pandas
 requests
 wget
 zstandard
diff --git a/tests/ops/filter/test_character_repetition_filter.py b/tests/ops/filter/test_character_repetition_filter.py
index e05f47f59..1d5683c82 100644
--- a/tests/ops/filter/test_character_repetition_filter.py
+++ b/tests/ops/filter/test_character_repetition_filter.py
@@ -41,7 +41,7 @@ def test_case(self):
             'text': '中文也是一个字算一个长度'
         }]
         dataset = Dataset.from_list(ds_list)
-        op = CharacterRepetitionFilter(rep_len=10,
+        op = CharacterRepetitionFilter(rep_len=5,
                                        min_ratio=0.0,
                                        max_ratio=0.4)
         self._run_character_repetition_filter(dataset, tgt_list, op)
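
For context, a minimal way to exercise the updated filter outside the test
harness might look like the sketch below. It assumes the usual data-juicer
filter interface, i.e. that compute_stats returns the sample with its stats
field filled in and that process returns a keep/drop boolean, and it assumes
Fields and StatsKeys are importable from data_juicer.utils.constant; the sample
text is made up.

    from data_juicer.ops.filter.character_repetition_filter import \
        CharacterRepetitionFilter
    from data_juicer.utils.constant import Fields, StatsKeys  # assumed path

    # Same parameters as the updated test case above.
    op = CharacterRepetitionFilter(rep_len=5, min_ratio=0.0, max_ratio=0.4)
    sample = {'text': 'Today is Sunday Sunday Sunday and it is sunny today.',
              Fields.stats: {}}
    sample = op.compute_stats(sample)
    print(sample[Fields.stats][StatsKeys.char_rep_ratio])
    print(op.process(sample))  # True only if the ratio lies in [0.0, 0.4]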