From 717248e858dab7063ceff6fab2b87730da19c8dd Mon Sep 17 00:00:00 2001
From: siyunzhao <siyunzhao@microsoft.com>
Date: Thu, 7 Mar 2024 06:03:37 +0000
Subject: [PATCH 1/4] fix sentence-filter adding separator bug and add
 documentation for structured_compress_prompt

---
 README.md                      | 26 ++++++++++++++++++++-
 llmlingua/prompt_compressor.py | 42 ++++++++++++----------------------
 2 files changed, 39 insertions(+), 29 deletions(-)
diff --git a/README.md b/README.md
index c886e80..eaeab6a 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,31 @@ llm_lingua = PromptCompressor("microsoft/phi-2")
 llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
 ```
 
-#### 3. **Learning More:**
+#### 3. **Using (Long)LLMLingua for Structured Prompt Compression:**
+
+You can split the text into different sections and define whether a section should be compressed and the rate of compression to apply.
+When using this method, each element of context should be segmented using one or more non-nested `<llmlingua></llmlingua>` tags. Each `<llmlingua>` tag can include optional parameters `rate` and `compress` (e.g., `<llmlingua, rate=0.3, compress=True>`), indicating the compression strategy for that segment.
+
+```python
+from llmlingua import PromptCompressor
+
+llm_lingua = PromptCompressor()
+structured_prompt = """<llmlingua, compress=False>Speaker 4:</llmlingua><llmlingua, rate=0.4> Thank you. And can we do the functions for content? Items I believe are 11, three, 14, 16 and 28, I believe.</llmlingua><llmlingua, compress=False>
+Speaker 0:</llmlingua><llmlingua, rate=0.4> Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group in the City Manager Department by $200 to provide a contribution to the Friends of the Long Beach Public Library. Item 12 is communication from Councilman Super Now. Recommendation to increase appropriation in the special advertising and promotion fund group and the city manager's department by $10,000 to provide support for the end of summer celebration. Item 13 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the city manager department by $500 to provide a donation to the Jazz Angels . Item 14 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.</llmlingua><llmlingua, compress=False>
+Speaker 4:</llmlingua><llmlingua, rate=0.6> We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.</llmlingua><llmlingua, compress=False>
+Speaker 2:</llmlingua><llmlingua, rate=0.6> Now. I had queued up to motion, but.</llmlingua><llmlingua, compress=False>
+Speaker 4:</llmlingua><llmlingua, rate=0.6> Great that we have any public comment on this.</llmlingua>"""
+compressed_prompt = llm_lingua.structured_compress_prompt(structured_prompt, instruction="", question="", rate=0.5)
+print(compressed_prompt['compressed_prompt'])
+
+# > Speaker 4:. And can we do the functions for content? Items I believe are11,116 28,.
+# Speaker 0:1 is a from Council on Price recommendation to increaseation the fund group the City Manager Department $20 to provide a the the.1 is Councilman Super Now. the specialising and group the manager provide the of summerman manager $ a the Jazzels a communicationman. increase the group a the Little.1man Allen0 provide to Casa Kor, Sewia, of and Dave Van communication. from Mayor Council Member Mur to Ronmer. Basketball andic
+# Speaker 4: a a second time asman servedman Ring and and have
+# Speaker 2: Now. I had queued up to motion, but.
+# Speaker 4: Great that we have any public comment on this.
+```
+
+#### 4. **Learning More:**
 
 To understand how to apply LLMLingua and LongLLMLingua in real-world scenarios like RAG, Online Meetings, CoT, and Code, please refer to our [**examples**](./examples). For detailed guidance, the [**documentation**](./DOCUMENT.md) provides extensive recommendations on effectively utilizing LLMLingua.
 
diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py
index dce95d9..c2d3306 100644
--- a/llmlingua/prompt_compressor.py
+++ b/llmlingua/prompt_compressor.py
@@ -835,34 +835,23 @@ def keep_sentence(dem_idx: int, sent_keep: int):
             for idx in idxs:
                 sentence_ppl[idx] += high_priority_bonus
 
-        def sync_sentence(segments, text):
-            seg_num = len(segments)
-            new_segments = []
-            text_seen = 0
-            seg_idx, cur_seg_seen = 0, 0
-            for i, s in enumerate(text):
-                while seg_idx < seg_num and s != segments[seg_idx][cur_seg_seen]:
-                    if cur_seg_seen < len(segments[seg_idx]) - 1:
-                        cur_seg_seen += 1
-                        continue
-                    new_segments.append(text[text_seen:i])
-                    text_seen = i
-                    seg_idx += 1
-                    cur_seg_seen = 0
-                cur_seg_seen += 1
-                if seg_idx == seg_num:
+        def sync_sentence(sentences, text):
+            seen_text = 0
+            sentence_num = len(sentences)
+            new_sentences = []
+            for i, s in enumerate(sentences):
+                assert s == text[seen_text: seen_text + len(s)]
+                if i == sentence_num - 1:
+                    new_sentences.append(text[seen_text:])
                     break
-                if cur_seg_seen == len(segments[seg_idx]):
-                    new_segments.append(text[text_seen : i + 1])
-                    text_seen = i + 1
-                    seg_idx += 1
-                    cur_seg_seen = 0
-            if text_seen < len(text):
-                new_segments.append(text[text_seen:])
-            assert len("".join(new_segments)) == len(text)
-            return new_segments
+                next_sentence_start = text.find(sentences[i + 1][:5], seen_text + len(s))
+                new_sentences.append(text[seen_text: next_sentence_start])
+                seen_text = next_sentence_start
+            assert "".join(new_sentences) == text
+            return new_sentences
 
         sentences = [nltk.sent_tokenize(c) for c in context]
+        sentences = [sync_sentence(s, c) for s, c in zip(sentences, context)]
         dem_g, s2de, idx = defaultdict(set), defaultdict(int), 0
         for idx_d, s in enumerate(sentences):
             for _ in s:
@@ -871,9 +860,6 @@ def sync_sentence(segments, text):
                 idx += 1
 
         if context_segs is not None:
-            context_segs = [
-                sync_sentence(s, "".join(c)) for s, c in zip(context_segs, sentences)
-            ]
             sen2seg_ratio = {}
             idx = 0
             for idx_d, sentences_each_context in enumerate(sentences):

From 39a0d521e9a7dbc5efdf60e9a97e7e163bc3db55 Mon Sep 17 00:00:00 2001
From: siyunzhao <siyunzhao@microsoft.com>
Date: Thu, 7 Mar 2024 08:39:38 +0000
Subject: [PATCH 2/4] update unittest data

---
 tests/test_llmlingua.py     | 16 +++++++++-------
 tests/test_longllmlingua.py | 14 +++++++-------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tests/test_llmlingua.py b/tests/test_llmlingua.py
index 59302a7..0416f1e 100644
--- a/tests/test_llmlingua.py
+++ b/tests/test_llmlingua.py
@@ -36,16 +36,18 @@ class LLMLinguaTester(unittest.TestCase):
         "id": "987654",
         "name": "John Doe",
         "isActive": "true",
-        "biography": " Doe, born in York in 1985 a renowned engineer with over the field.John from MIT and has since worked with several.He has a for developing innovative solutions and has to numerous projectsJohn is also avid and speaker at conferences, his on and their the business.In his enjoys, reading fiction and playing piano.",
+        "biography": " Doe, born in York in 1985 a renowned engineer with over in the field. John from MIT a Science and has since worked with several He has a for developing innovative solutions and has to numerous projects John is also avid and speaker at conferences, his on technologies and their the business. In his enjoys, reading fiction and playing piano.",
         "employmentHistory": [
             {
             "company": "TechCorp",
-            "role": "Senior",
-            "description": " John was for leading of engineers and of scalable.He in the of cloud technologies, significantly the of their digital operations."},
+            "role": "Senior Engineer",
+            "description": " John was for leading of engineers and of scalable He in the of cloud technologies company, significantly the of their digital operations."
+            },
             {
             "company": "Innovatech",
             "role": "Lead",
-            "description": "In his John on developingedge AI and implementing learning solutions for various was in developing a predictive analytics tool that transformed the company's approach to data-driven decision making."}
+            "description": "In his John on developingedge AI and implementing machine learning solutions for various business applications. He was instrumental in developing a predictive analytics tool that transformed the company's approach to data-driven decision making."
+            }
         ],
         "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development"
     }"""
@@ -109,9 +111,9 @@ def test_general_structured_compress_prompt(self):
             self.JSON_COMPRESSED_PROMPT,
         )
         self.assertEqual(compressed_prompt["origin_tokens"], 318)
-        self.assertEqual(compressed_prompt["compressed_tokens"], 225)
-        self.assertEqual(compressed_prompt["ratio"], "1.4x")
-        self.assertEqual(compressed_prompt["rate"], "70.8%")
+        self.assertEqual(compressed_prompt["compressed_tokens"], 241)
+        self.assertEqual(compressed_prompt["ratio"], "1.3x")
+        self.assertEqual(compressed_prompt["rate"], "75.8%")
 
         # Multiple Stuctured Context
         compressed_prompt = self.llmlingua.structured_compress_prompt(
diff --git a/tests/test_longllmlingua.py b/tests/test_longllmlingua.py
index 9b5fc5b..7fe906c 100644
--- a/tests/test_longllmlingua.py
+++ b/tests/test_longllmlingua.py
@@ -37,17 +37,17 @@ class LongLLMLinguaTester(unittest.TestCase):
         "id": "765",
         "name": " Doe",
         "isActive": "true",
-        "biography": "John York is a software years experience in the. MIT with degree Science and has worked with several Fortune companies He developing solutions and projects is and speaker tech conferences emerging business world. In his time, enjoys, reading, playing.",
+        "biography": "John York is a software years experience in the. MIT with degree Science and has worked with several Fortune companies He developing solutions and projects is and speaker tech conferences emerging business world. ",
         "employmentHistory": [
             {
             "company": "TechCorp",
             "role": "",
-            "description": " Tech, was leading software engineers overseeing web played key the cloud technologies within the company, significantly enhancing their digital."
+            "description": " Tech, was leading software engineers overseeing web played key the cloud technologies within the company, significantly enhancing their digital operations."
             },
             {
             "company": "Innovatech",
             "role": "",
-            "description": " role developingedge AI implementing machine learning solutions for various business applications. He was instrumental in developing a predictive analytics tool that transformed the company's approach to data-driven decision making."
+            "description": " his role developingedge AI implementing learning for He was developing a analytics tool transformed the company's approach to data-driven decision making."
             }
         ],
         "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development"
@@ -121,7 +121,7 @@ def test_general_structured_compress_prompt(self):
             [self.JSON_PROMPT],
             question=self.STRUCTURED_QUESTION,
             target_token=150,
-            use_sentence_level_filter=False,
+            use_sentence_level_filter=True,
             condition_in_question="after_condition",
             reorder_context="sort",
             dynamic_context_compression_ratio=0.4,
@@ -136,9 +136,9 @@ def test_general_structured_compress_prompt(self):
             self.JSON_COMPRESSED_PROMPT,
         )
         self.assertEqual(compressed_prompt["origin_tokens"], 329)
-        self.assertEqual(compressed_prompt["compressed_tokens"], 205)
-        self.assertEqual(compressed_prompt["ratio"], "1.6x")
-        self.assertEqual(compressed_prompt["rate"], "62.3%")
+        self.assertEqual(compressed_prompt["compressed_tokens"], 188)
+        self.assertEqual(compressed_prompt["ratio"], "1.8x")
+        self.assertEqual(compressed_prompt["rate"], "57.1%")
 
         # Multiple Stuctured Context
         compressed_prompt = self.llmlingua.structured_compress_prompt(

From cd3936239b0d3413602e7b8c90161216171bc3a4 Mon Sep 17 00:00:00 2001
From: siyunzhao <siyunzhao@microsoft.com>
Date: Mon, 11 Mar 2024 05:49:10 +0000
Subject: [PATCH 3/4] fix one-sentence bug

---
 llmlingua/prompt_compressor.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py
index c2d3306..33dcbd4 100644
--- a/llmlingua/prompt_compressor.py
+++ b/llmlingua/prompt_compressor.py
@@ -901,7 +901,10 @@ def sync_sentence(sentences, text):
         N = len(context_sentences)
         flags = list(range(len(context_sentences)))
         if len(sentence_tokens_length) == 1:
-            return context
+            segments_info = []
+            if context_segs is not None:
+                segments_info.append(sen2seg_ratio[0])
+            return context, segments_info
         if rank_method == "longllmlingua":
             sentence_ppl = [
                 self.get_condition_ppl(sentence, question, condition_in_question)
@@ -967,11 +970,6 @@ def sync_sentence(sentences, text):
                         segment_ratio.extend(sen2seg_ratio[idx + ii])
                 new_segments_info.append(segment_ratio)
             idx += len(s)
-        if context_segs is not None:
-            new_segments_info = [
-                self.concate_segment_info(segment_info)
-                for segment_info in new_segments_info
-            ]
         return res, new_segments_info
 
     def get_compressed_input(

From 3c7e2b6509b003adaf1d2d34943a0765ed99ab19 Mon Sep 17 00:00:00 2001
From: siyunzhao <siyunzhao@microsoft.com>
Date: Tue, 12 Mar 2024 10:15:49 +0000
Subject: [PATCH 4/4] update readme

---
 README.md | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index eaeab6a..f309f41 100644
--- a/README.md
+++ b/README.md
@@ -128,28 +128,20 @@ llm_lingua = PromptCompressor("microsoft/phi-2")
 llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
 ```
 
-#### 3. **Using (Long)LLMLingua for Structured Prompt Compression:**
+#### 3. **Advanced usage - Structured Prompt Compression:**
 
-You can split the text into different sections and define whether a section should be compressed and the rate of compression to apply.
-When using this method, each element of context should be segmented using one or more non-nested `<llmlingua></llmlingua>` tags. Each `<llmlingua>` tag can include optional parameters `rate` and `compress` (e.g., `<llmlingua, rate=0.3, compress=True>`), indicating the compression strategy for that segment.
+Split the text into sections and decide, for each section, whether to compress it and at what rate. Use `<llmlingua></llmlingua>` tags to segment the context; each tag accepts optional `rate` and `compress` parameters (e.g., `<llmlingua, rate=0.3, compress=True>`).
 
 ```python
-from llmlingua import PromptCompressor
-
-llm_lingua = PromptCompressor()
 structured_prompt = """<llmlingua, compress=False>Speaker 4:</llmlingua><llmlingua, rate=0.4> Thank you. And can we do the functions for content? Items I believe are 11, three, 14, 16 and 28, I believe.</llmlingua><llmlingua, compress=False>
 Speaker 0:</llmlingua><llmlingua, rate=0.4> Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group in the City Manager Department by $200 to provide a contribution to the Friends of the Long Beach Public Library. Item 12 is communication from Councilman Super Now. Recommendation to increase appropriation in the special advertising and promotion fund group and the city manager's department by $10,000 to provide support for the end of summer celebration. Item 13 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the city manager department by $500 to provide a donation to the Jazz Angels . Item 14 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.</llmlingua><llmlingua, compress=False>
-Speaker 4:</llmlingua><llmlingua, rate=0.6> We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.</llmlingua><llmlingua, compress=False>
-Speaker 2:</llmlingua><llmlingua, rate=0.6> Now. I had queued up to motion, but.</llmlingua><llmlingua, compress=False>
-Speaker 4:</llmlingua><llmlingua, rate=0.6> Great that we have any public comment on this.</llmlingua>"""
+Speaker 4:</llmlingua><llmlingua, rate=0.6> We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.</llmlingua>"""
 compressed_prompt = llm_lingua.structured_compress_prompt(structured_prompt, instruction="", question="", rate=0.5)
 print(compressed_prompt['compressed_prompt'])
 
-# > Speaker 4:. And can we do the functions for content? Items I believe are11,116 28,.
-# Speaker 0:1 is a from Council on Price recommendation to increaseation the fund group the City Manager Department $20 to provide a the the.1 is Councilman Super Now. the specialising and group the manager provide the of summerman manager $ a the Jazzels a communicationman. increase the group a the Little.1man Allen0 provide to Casa Kor, Sewia, of and Dave Van communication. from Mayor Council Member Mur to Ronmer. Basketball andic
-# Speaker 4: a a second time asman servedman Ring and and have
-# Speaker 2: Now. I had queued up to motion, but.
-# Speaker 4: Great that we have any public comment on this.
+# > Speaker 4:. And can we do the functions for content? Items I believe are11,,116 28,.
+# Speaker 0: a from Council on Price to increase the fund group the Manager0 provide a the the1 is Councilman Super Now. the special group the provide the summerman a the Jazzels a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.
+# Speaker 4: We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.
 ```
 
 #### 4. **Learning More:**