diff --git a/README.md b/README.md index 6f51cb6e..9d87c5bc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ This repository allows you to segment text into sentences or other semantic unit - **SaT** — [Segment Any Text: A Universal Approach for Robust, Efficient and Adaptable Sentence Segmentation](https://arxiv.org/abs/2406.16678) by Markus Frohmann, Igor Sterner, Benjamin Minixhofer, Ivan Vulić and Markus Schedl (**state-of-the-art, encouraged**). - **WtP** — [Where’s the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation](https://aclanthology.org/2023.acl-long.398/) by Benjamin Minixhofer, Jonas Pfeiffer and Ivan Vulić (*previous version, maintained for reproducibility*). -The namesake WtP is maintained for reproducibility. Our new followup SaT provides robust, efficient and adaptable sentence segmentation across 85 languages at higher performance and less compute cost. Check out the **state-of-the-art** results in 8 distinct corpora and 85 languages demonstrated in our [Segment any Text paper](https://arxiv.org/abs/2406.16678). +The namesake WtP is maintained for consistency. Our new followup SaT provides robust, efficient and adaptable sentence segmentation across 85 languages at higher performance and less compute cost. Check out the **state-of-the-art** results in 8 distinct corpora and 85 languages demonstrated in our [Segment any Text paper](https://arxiv.org/abs/2406.16678). ![System Figure](./configs/system-fig.png) @@ -154,11 +154,9 @@ Clone the repository and install requirements: ``` git clone https://github.com/segment-any-text/wtpsplit -cd segment-any-text -pip install -e . +cd wtpsplit pip install -r requirements.txt -cd adapters -pip install -e . +pip install adapters==0.2.1 --no-dependencies cd .. ``` diff --git a/requirements.txt b/requirements.txt index 6d3536f7..33ac92b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,9 @@ cohere replicate onnx onnxruntime -torchinfo \ No newline at end of file +torchinfo +mosestokenizer +cached_property +tqdm +skops +pandas \ No newline at end of file diff --git a/setup.py b/setup.py index 24dd9002..49514e9b 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ "pandas>=1", "cached_property", # for Py37 "mosestokenizer", - "adapters==0.2.1" + "adapters" ], url="https://github.com/segment-any-text/wtpsplit", package_data={"wtpsplit": ["data/*"]}, diff --git a/wtpsplit/evaluation/intrinsic_pairwise.py b/wtpsplit/evaluation/intrinsic_pairwise.py index 2da00a29..46a92a24 100644 --- a/wtpsplit/evaluation/intrinsic_pairwise.py +++ b/wtpsplit/evaluation/intrinsic_pairwise.py @@ -22,7 +22,7 @@ from wtpsplit.extract import PyTorchWrapper from wtpsplit.extract_batched import extract_batched from wtpsplit.utils import Constants, token_to_char_probs -from wtpsplit.evaluation.intrinsic import compute_statistics +from wtpsplit.evaluation.adapt import compute_statistics logger = logging.getLogger() logger.setLevel(logging.INFO) diff --git a/wtpsplit/evaluation/intrinsic_ted.py b/wtpsplit/evaluation/intrinsic_ted.py index 5840e855..9183f29a 100644 --- a/wtpsplit/evaluation/intrinsic_ted.py +++ b/wtpsplit/evaluation/intrinsic_ted.py @@ -18,7 +18,7 @@ import wtpsplit.models # noqa: F401 from wtpsplit.evaluation import get_labels, train_mixture from wtpsplit.evaluation.evaluate_sepp_nlg_subtask1 import evaluate_subtask1 -from wtpsplit.evaluation.intrinsic import process_logits +from wtpsplit.evaluation.adapt import process_logits from wtpsplit.extract import PyTorchWrapper, extract from wtpsplit.utils import Constants, sigmoid diff --git a/wtpsplit/evaluation/punct_annotation_wtp.py b/wtpsplit/evaluation/punct_annotation_wtp.py index 3d4ec76c..3a8df910 100644 --- a/wtpsplit/evaluation/punct_annotation_wtp.py +++ b/wtpsplit/evaluation/punct_annotation_wtp.py @@ -9,7 +9,7 @@ import wtpsplit.models # noqa from wtpsplit.extract import PyTorchWrapper from wtpsplit.utils import Constants -from wtpsplit.evaluation.intrinsic import process_logits +from wtpsplit.evaluation.adapt import process_logits @dataclass