From 717248e858dab7063ceff6fab2b87730da19c8dd Mon Sep 17 00:00:00 2001
From: siyunzhao
Date: Thu, 7 Mar 2024 06:03:37 +0000
Subject: [PATCH] fix sentence-filter separator bug and add documentation for structured_compress_prompt

---
 README.md                      | 26 ++++++++++++++++++++-
 llmlingua/prompt_compressor.py | 42 ++++++++++++----------------------
 2 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index c886e80..eaeab6a 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,31 @@ llm_lingua = PromptCompressor("microsoft/phi-2")
 llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
 ```
 
-#### 3. **Learning More:**
+#### 3. **Using (Long)LLMLingua for Structured Prompt Compression:**
+
+You can split the text into different sections and define, for each section, whether it should be compressed and at what rate.
+When using this method, each element of context should be segmented using one or more non-nested `<llmlingua></llmlingua>` tags. Each `<llmlingua>` tag can include the optional parameters `rate` and `compress` (e.g., `<llmlingua, rate=0.4>` or `<llmlingua, compress=False>`), indicating the compression strategy for that segment.
+
+```python
+from llmlingua import PromptCompressor
+
+llm_lingua = PromptCompressor()
+structured_prompt = """Speaker 4: Thank you. And can we do the functions for content? Items I believe are 11, three, 14, 16 and 28, I believe.
+Speaker 0: Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group in the City Manager Department by $200 to provide a contribution to the Friends of the Long Beach Public Library. Item 12 is communication from Councilman Super Now. Recommendation to increase appropriation in the special advertising and promotion fund group and the city manager's department by $10,000 to provide support for the end of summer celebration. Item 13 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the city manager department by $500 to provide a donation to the Jazz Angels . Item 14 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.
+Speaker 4: We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.
+Speaker 2: Now. I had queued up to motion, but.
+Speaker 4: Great that we have any public comment on this."""
+compressed_prompt = llm_lingua.structured_compress_prompt(structured_prompt, instruction="", question="", rate=0.5)
+print(compressed_prompt['compressed_prompt'])
+
+# > Speaker 4:. And can we do the functions for content? Items I believe are11,116 28,.
+# Speaker 0:1 is a from Council on Price recommendation to increaseation the fund group the City Manager Department $20 to provide a the the.1 is Councilman Super Now. the specialising and group the manager provide the of summerman manager $ a the Jazzels a communicationman. increase the group a the Little.1man Allen0 provide to Casa Kor, Sewia, of and Dave Van communication. from Mayor Council Member Mur to Ronmer. Basketball andic
+# Speaker 4: a a second time asman servedman Ring and and have
+# Speaker 2: Now. I had queued up to motion, but.
+# Speaker 4: Great that we have any public comment on this.
+```
+
+#### 4. **Learning More:**
 
 To understand how to apply LLMLingua and LongLLMLingua in real-world scenarios like RAG, Online Meetings, CoT, and Code, please refer to our [**examples**](./examples). For detailed guidance, the [**documentation**](./DOCUMENT.md) provides extensive recommendations on effectively utilizing LLMLingua.
 
diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py
index dce95d9..c2d3306 100644
--- a/llmlingua/prompt_compressor.py
+++ b/llmlingua/prompt_compressor.py
@@ -835,34 +835,23 @@ def keep_sentence(dem_idx: int, sent_keep: int):
             for idx in idxs:
                 sentence_ppl[idx] += high_priority_bonus
 
-        def sync_sentence(segments, text):
-            seg_num = len(segments)
-            new_segments = []
-            text_seen = 0
-            seg_idx, cur_seg_seen = 0, 0
-            for i, s in enumerate(text):
-                while seg_idx < seg_num and s != segments[seg_idx][cur_seg_seen]:
-                    if cur_seg_seen < len(segments[seg_idx]) - 1:
-                        cur_seg_seen += 1
-                        continue
-                    new_segments.append(text[text_seen:i])
-                    text_seen = i
-                    seg_idx += 1
-                    cur_seg_seen = 0
-                cur_seg_seen += 1
-                if seg_idx == seg_num:
+        def sync_sentence(sentences, text):
+            seen_text = 0
+            sentence_num = len(sentences)
+            new_sentences = []
+            for i, s in enumerate(sentences):
+                assert s == text[seen_text: seen_text + len(s)]
+                if i == sentence_num - 1:
+                    new_sentences.append(text[seen_text:])
                     break
-                if cur_seg_seen == len(segments[seg_idx]):
-                    new_segments.append(text[text_seen : i + 1])
-                    text_seen = i + 1
-                    seg_idx += 1
-                    cur_seg_seen = 0
-            if text_seen < len(text):
-                new_segments.append(text[text_seen:])
-            assert len("".join(new_segments)) == len(text)
-            return new_segments
+                next_sentence_start = text.find(sentences[i + 1][:5], seen_text + len(s))
+                new_sentences.append(text[seen_text: next_sentence_start])
+                seen_text = next_sentence_start
+            assert "".join(new_sentences) == text
+            return new_sentences
 
         sentences = [nltk.sent_tokenize(c) for c in context]
+        sentences = [sync_sentence(s, c) for s, c in zip(sentences, context)]
         dem_g, s2de, idx = defaultdict(set), defaultdict(int), 0
         for idx_d, s in enumerate(sentences):
             for _ in s:
@@ -871,9 +860,6 @@ def sync_sentence(segments, text):
                 idx += 1
 
         if context_segs is not None:
-            context_segs = [
-                sync_sentence(s, "".join(c)) for s, c in zip(context_segs, sentences)
-            ]
             sen2seg_ratio = {}
             idx = 0
             for idx_d, sentences_each_context in enumerate(sentences):
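Editor's note on the README example above: the `<llmlingua></llmlingua>` tags described in the added prose do not appear inside the sample `structured_prompt`. The snippet below is a minimal sketch of what a tagged prompt could look like; the segment boundaries and the `rate`/`compress` values are illustrative assumptions, not values taken from this patch.

```python
from llmlingua import PromptCompressor

llm_lingua = PromptCompressor()

# Illustrative segmentation only: compress=False keeps a segment verbatim,
# while rate sets that segment's own compression rate.
tagged_prompt = (
    "<llmlingua, compress=False>Speaker 4:</llmlingua>"
    "<llmlingua, rate=0.4> Thank you. And can we do the functions for content?</llmlingua>"
    "<llmlingua, compress=False>\nSpeaker 0:</llmlingua>"
    "<llmlingua, rate=0.4> Item 11 is a communication from Council on Price ...</llmlingua>"
)
result = llm_lingua.structured_compress_prompt(
    tagged_prompt, instruction="", question="", rate=0.5
)
print(result["compressed_prompt"])
```

Segments marked `compress=False` would be passed through untouched, which is consistent with the sample output above, where the `Speaker N:` labels survive while the surrounding content is heavily compressed; segments without an explicit `rate` presumably fall back to the global `rate` argument.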
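Editor's note on the `prompt_compressor.py` change: `nltk.sent_tokenize` drops the separators (newlines, repeated spaces) between sentences, so re-joining the tokenized sentences no longer reproduces the original context. The rewritten `sync_sentence` walks the original text and attaches each dropped separator to the sentence that precedes it, which presumably also makes the later re-sync of `context_segs` against `"".join(sentences)` redundant (hence its removal in the second hunk). Below is a standalone sketch of the new helper together with a tiny check; the example string is invented for illustration.

```python
import nltk

# Sentence tokenizer data used by sent_tokenize (newer NLTK releases may need "punkt_tab").
nltk.download("punkt", quiet=True)


def sync_sentence(sentences, text):
    """Re-align tokenized sentences with the original text so the separators
    dropped by nltk.sent_tokenize are preserved in the returned pieces."""
    seen_text = 0
    sentence_num = len(sentences)
    new_sentences = []
    for i, s in enumerate(sentences):
        # Each tokenized sentence must occur verbatim at the current offset.
        assert s == text[seen_text : seen_text + len(s)]
        if i == sentence_num - 1:
            new_sentences.append(text[seen_text:])
            break
        # Locate the start of the next sentence and keep the separator between
        # the two sentences attached to the current one.
        next_sentence_start = text.find(sentences[i + 1][:5], seen_text + len(s))
        new_sentences.append(text[seen_text:next_sentence_start])
        seen_text = next_sentence_start
    assert "".join(new_sentences) == text
    return new_sentences


context = "First sentence.\n\nSecond sentence.  Third sentence."
sentences = nltk.sent_tokenize(context)
assert "".join(sentences) != context                          # separators were lost
assert "".join(sync_sentence(sentences, context)) == context  # and are restored
```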