From 90ef09440330556acbfefc28d173987e38569315 Mon Sep 17 00:00:00 2001 From: zhijianma Date: Tue, 31 Oct 2023 16:23:28 +0800 Subject: [PATCH] update op desc in overview scan --- demos/overview_scan/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/demos/overview_scan/app.py b/demos/overview_scan/app.py index 7ac664eb7..1050cc353 100644 --- a/demos/overview_scan/app.py +++ b/demos/overview_scan/app.py @@ -88,7 +88,7 @@ | Type | Number | Description | |-----------------------------------|:------:|-------------------------------------------------| | Formatter | 7 | Discovers, loads, and canonicalizes source data | -| Mapper | 19 | Edits and transforms samples | +| Mapper | 21 | Edits and transforms samples | | Filter | 16 | Filters out low-quality samples | | Deduplicator | 3 | Detects and removes duplicate samples | | Selector | 2 | Selects top samples based on ranking | @@ -111,6 +111,7 @@ ''' | Operator | Domain | Lang | Description | |-----------------------------------------------|--------------------|--------|----------------------------------------------------------------------------------------------------------------| +| chinese_convert_mapper | General | zh | Convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji (by [opencc](https://github.com/BYVoid/OpenCC)) | | clean_copyright_mapper | Code | en, zh | Removes copyright notice at the beginning of code files (:warning: must contain the word *copyright*) | | clean_email_mapper | General | en, zh | Removes email information | | clean_html_mapper | General | en, zh | Removes HTML tags and returns plain text of all the nodes | @@ -125,6 +126,7 @@ | remove_comments_mapper | LaTeX | en, zh | Removes the comments of TeX documents | | remove_header_mapper | LaTeX | en, zh | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names | | remove_long_words_mapper | General | en, zh | Removes words with length outside the specified 
range | +| remove_non_chinese_character_mapper | General | en, zh | Removes non-Chinese characters in text samples. | | remove_specific_chars_mapper | General | en, zh | Removes any user-specified characters or substrings | | remove_table_text_mapper | General, Financial | en | Detects and removes possible table contents (:warning: relies on regular expression matching and thus fragile) | | remove_words_with_incorrect_
substrings_mapper | General | en, zh | Removes words containing specified substrings |