From 75d192e6644388259947572547183cb4b60303fe Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Wed, 24 Jan 2024 09:38:45 +0000 Subject: [PATCH 01/19] update the definition of compression ratio --- llmlingua/prompt_compressor.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 9e8fe99..512e7d0 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -159,7 +159,7 @@ def compress_prompt( context: List[str], instruction: str = "", question: str = "", - ratio: float = 0.5, + ratio: float = 2.0, target_token: float = -1, iterative_size: int = 200, force_context_ids: List[int] = None, @@ -189,13 +189,16 @@ def compress_prompt( context (List[str]): List of context strings that form the basis of the prompt. instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. question (str, optional): A specific question that the prompt is addressing. Default is an empty string. - ratio (float, optional): The minimum compression ratio target to be achieved. Default is 0.5. The actual compression ratio - generally exceeds the specified target, but there can be fluctuations due to differences in tokenizers. If specified, - it should be a float greater than or equal to 1.0, representing the target compression ratio. + ratio (float, optional): The minimum compression ratio target to be achieved. The compression ratio is defined + the same as in Wikipedia [Data compression ratio](https://en.wikipedia.org/wiki/Data_compression_ratio): + .. math::\text{Compression Ratio} = \frac{\text{Uncompressed Size}}{\text{Compressed Size}} + Default is 2.0. The actual compression ratio generally exceeds the specified target, but there can be + fluctuations due to differences in tokenizers. If specified, it should be a float greater than or equal + to 1.0, representing the target compression ratio. target_token (float, optional): The maximum number of tokens to be achieved. Default is -1, indicating no specific target. The actual number of tokens after compression should generally be less than the specified target_token, but there can be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as - the sole criterion, overriding the ratio. + the sole criterion, overriding the ``ratio``. iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. force_context_number (int, optional): The number of context sections to forcibly include. Default is None. @@ -223,8 +226,8 @@ def compress_prompt( - "origin_tokens" (int): The original number of tokens in the input. - "compressed_tokens" (int): The number of tokens in the compressed output. - "ratio" (str): The compression ratio achieved, in a human-readable format. + - "rate" (str): The compression rate achieved, calculated as the token number after compression divided by the original token number. - "saving" (str): Estimated savings in GPT-4 token usage. 
- """ if not context: context = [" "] @@ -258,7 +261,7 @@ def compress_prompt( + question_tokens_length + sum(context_tokens_length) ) - * (1 - ratio) + * (1 / ratio) - instruction_tokens_length - (question_tokens_length if concate_question else 0) ) @@ -351,11 +354,13 @@ def compress_prompt( compressed_tokens = len(encoding.encode(compressed_prompt)) saving = (origin_tokens - compressed_tokens) * 0.06 / 1000 ratio = 1 if compressed_tokens == 0 else origin_tokens / compressed_tokens + rate = 1 / ratio return { "compressed_prompt": compressed_prompt, "origin_tokens": origin_tokens, "compressed_tokens": compressed_tokens, "ratio": f"{ratio:.1f}x", + "rate": f"{rate * 100:.1f}%", "saving": f", Saving ${saving:.1f} in GPT-4.", } From fb2290c45ec35333e6cb6cc4fd7b2d3e665c6c9c Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Tue, 30 Jan 2024 05:41:39 +0000 Subject: [PATCH 02/19] update structured prompt compress --- llmlingua/prompt_compressor.py | 447 ++++++++++++++++++++++++++++++++- 1 file changed, 436 insertions(+), 11 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 512e7d0..a588515 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -11,7 +11,7 @@ import nltk import tiktoken from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - +import re encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") @@ -183,7 +183,7 @@ def compress_prompt( concate_question: bool = True, ): """ - Compresses the given context, instruction and question. + Compresses the given context. Args: context (List[str]): List of context strings that form the basis of the prompt. @@ -269,7 +269,7 @@ def compress_prompt( condition_in_question = condition_in_question.replace("_condition", "") if len(context) > 1 and use_context_level_filter: - context, dynamic_ratio = self.control_context_budget( + context, dynamic_ratio, _ = self.control_context_budget( context, context_tokens_length, target_token, @@ -286,7 +286,7 @@ def compress_prompt( dynamic_ratio = [0.0] * len(context) if use_sentence_level_filter: - context = self.control_sentence_budget( + context, _ = self.control_sentence_budget( context, target_token, keep_first_sentence=keep_first_sentence, @@ -400,6 +400,7 @@ def get_dynamic_compression_ratio( iterative_size: int, dynamic_ratio: list, start: int, + seg_info: List[List[tuple]] = None, ): def get_ratio(base: float, delta: float): return max(min(1, base + delta), 0) @@ -436,6 +437,61 @@ def get_ratio(base: float, delta: float): res.append(last_target) return res + def get_structured_dynamic_compression_ratio( + self, + context: list, + iterative_size: int, + dynamic_ratio: list, + start: int, + seg_info: List[List[tuple]] = None, + ): + if start: + context_length = context_length[1:] + global_dynamic_rate, global_dynamic_compress, tmp_context = [], [], [] + for context_idx, text in enumerate(context): + text_seen = 0 + new_text = [] + for seg_idx, (seg_len, seg_ratio, seg_compress) in enumerate(seg_info[context_idx]): + new_text.append(text[text_seen:text_seen + seg_len]) + if seg_compress: + global_dynamic_rate.append(1 / seg_ratio) + else: + global_dynamic_rate.append(1.0) + global_dynamic_compress.append(seg_compress) + text_seen += seg_len + tmp_context.append((" " + self.tokenizer.bos_token).join(new_text)) + tmp_context = ("\n\n " + self.tokenizer.bos_token).join(tmp_context) + context_input_ids = self.tokenizer(tmp_context).input_ids + + assert context_input_ids.count(1) == len(global_dynamic_rate) + 
indexes_of_seperator = [i for i, v in enumerate(context_input_ids) if v == 1][1:] + [len(context_input_ids)] + + res, idx, token_seen, last, last_target = [], 0, 0, 0, [] + while idx < len(indexes_of_seperator): + if indexes_of_seperator[idx] - token_seen > iterative_size - last: + last_target.append( + (iterative_size - last, global_dynamic_rate[idx]) + ) + res.append(last_target) + token_seen += iterative_size - last + last, last_target = 0, [] + k = (indexes_of_seperator[idx] - token_seen) // iterative_size + res.extend( + [[(iterative_size, global_dynamic_rate[idx])]] * k + ) + token_seen += k * iterative_size + + if indexes_of_seperator[idx] - token_seen: + last_target.append( + (indexes_of_seperator[idx] - token_seen, global_dynamic_rate[idx]) + ) + last += indexes_of_seperator[idx] - token_seen + token_seen = indexes_of_seperator[idx] + 1 + idx += 1 + if last_target: + res.append(last_target) + return res + def control_context_budget( self, context: List[str], @@ -498,7 +554,7 @@ def control_context_budget( dynamic_ratio = [0.0] * len(used) res = [context[idx] for idx in used if idx < len(context)] - return res, dynamic_ratio + return res, dynamic_ratio, used def control_sentence_budget( self, @@ -512,12 +568,42 @@ def control_sentence_budget( question: str = "", condition_in_question: str = "none", rank_method: str = "longllmlingua", + context_segs: List[List[str]] = None, + context_segs_ratio: List[List[float]] = None, + context_segs_compress: List[List[bool]] = None, ): def keep_sentence(dem_idx: int, sent_keep: int): idxs = sorted(dem_g[dem_idx], key=lambda x: sentence_ppl[x])[:sent_keep] for idx in idxs: sentence_ppl[idx] += high_priority_bonus + def sync_sentence(segments, text): + seg_num = len(segments) + new_segments= [] + text_seen = 0 + seg_idx, cur_seg_seen = 0, 0 + for i, s in enumerate(text): + while seg_idx < seg_num and s != segments[seg_idx][cur_seg_seen]: + if cur_seg_seen < len(segments[seg_idx]) - 1: + cur_seg_seen += 1 + continue + new_segments.append(text[text_seen:i]) + text_seen = i + seg_idx += 1 + cur_seg_seen = 0 + cur_seg_seen += 1 + if seg_idx == seg_num: + break + if cur_seg_seen == len(segments[seg_idx]): + new_segments.append(text[text_seen : i + 1]) + text_seen = i + 1 + seg_idx += 1 + cur_seg_seen = 0 + if text_seen < len(text): + new_segments.append(text[text_seen:]) + assert len("".join(new_segments)) == len(text) + return new_segments + sentences = [nltk.sent_tokenize(c) for c in context] dem_g, s2de, idx = defaultdict(set), defaultdict(int), 0 for idx_d, s in enumerate(sentences): @@ -526,6 +612,30 @@ def keep_sentence(dem_idx: int, sent_keep: int): s2de[idx] = idx_d idx += 1 + if context_segs is not None: + context_segs = [sync_sentence(s, "".join(c)) for s, c in zip(context_segs, sentences)] + sen2seg_ratio = {} + idx = 0 + for idx_d, sentences_each_context in enumerate(sentences): + segments_length = [len(s) for s in context_segs[idx_d]] + seg_idx, cur_seg_seen = 0, 0 + for sentence in sentences_each_context: + sentence_seg_ratio = [] + remain = len(sentence) + while remain: + if segments_length[seg_idx] - cur_seg_seen <= remain: + new_seg_len = segments_length[seg_idx] - cur_seg_seen + sentence_seg_ratio.append((new_seg_len, context_segs_ratio[idx_d][seg_idx], context_segs_compress[idx_d][seg_idx])) + seg_idx += 1 + cur_seg_seen = 0 + remain -= new_seg_len + else: + sentence_seg_ratio.append((remain, context_segs_ratio[idx_d][seg_idx], context_segs_compress[idx_d][seg_idx])) + cur_seg_seen += remain + remain = 0 + sen2seg_ratio[idx] = 
sentence_seg_ratio + idx += 1 + context_sentences = [s for ii in sentences for s in ii] sentence_tokens_length = [ self.get_token_length(sentence) for sentence in context_sentences @@ -581,11 +691,22 @@ def keep_sentence(dem_idx: int, sent_keep: int): break idx = 0 res = [] + new_segments_info = [] for s in sentences: tmp = [jj for ii, jj in enumerate(s) if sentence_flags[idx + ii]] res.append("\n".join(tmp)) + if context_segs is not None: + segment_ratio = [] + for ii in range(len(s)): + if sentence_flags[idx + ii]: + last_element = (sen2seg_ratio[idx + ii][-1][0] + 1, sen2seg_ratio[idx + ii][-1][1], sen2seg_ratio[idx + ii][-1][2]) + segment_ratio.extend(sen2seg_ratio[idx + ii][:-1] + [last_element]) + segment_ratio = segment_ratio[:-1] + [(segment_ratio[-1][0] - 1, segment_ratio[-1][1], segment_ratio[-1][2])] + new_segments_info.append(segment_ratio) idx += len(s) - return res + if context_segs is not None: + new_segments_info = [self.concate_segment_info(segment_info) for segment_info in new_segments_info] + return res, new_segments_info def get_compressed_input( self, @@ -696,6 +817,8 @@ def get_compressed_input( def get_estimate_threshold_base_distribution( self, ppl, ratio: float, condition_flag: bool = False ): + if ratio == 1.0: + return float('-inf') ppl = ppl[ppl != 10000] target_token = max(0, min(len(ppl) - 1, int(len(ppl) * ratio) - 1)) return ( @@ -716,10 +839,16 @@ def iterative_compress_prompt( start: int = 0, dynamic_ratio: list = None, condition_compare: bool = False, - ): - iterative_ratios = self.get_dynamic_compression_ratio( - context, target_token, iterative_size, dynamic_ratio, start - ) + segments_info: List[List[tuple]] = None, + ): + if segments_info is None: + iterative_ratios = self.get_dynamic_compression_ratio( + context, target_token, iterative_size, dynamic_ratio, start + ) + else: + iterative_ratios = self.get_structured_dynamic_compression_ratio( + context, iterative_size, dynamic_ratio, start, segments_info + ) context = "\n\n".join(context) tokenized_text = self.tokenizer(context, return_tensors="pt") input_ids = tokenized_text["input_ids"].to(self.device) @@ -765,7 +894,7 @@ def iterative_compress_prompt( while end <= compressed_input_ids.shape[1]: if end > self.max_position_embeddings and past_key_values is not None: # KV-Cache Compression - e, s = end - self.max_position_embeddings, self.cache_bos_num + e, s = end - self.max_position_embeddings, min(self.cache_bos_num + start, self.max_position_embeddings) if pop_compressed_input_ids is None: pop_compressed_input_ids = compressed_input_ids[:, :e] else: @@ -1243,3 +1372,299 @@ def get_distance_longllmlingua(corpus, query): elif rank_method == "cohere": method = get_distance_cohere return method(context, question) + + def structured_compress_prompt( + self, + context: List[str], + instruction: str = "", + question: str = "", + ratio: float = 2.0, + target_token: float = -1, + iterative_size: int = 200, + force_context_ids: List[int] = None, + force_context_number: int = None, + use_sentence_level_filter: bool = False, + use_context_level_filter: bool = True, + use_token_level_filter: bool = True, + keep_split: bool = False, + keep_first_sentence: int = 0, + keep_last_sentence: int = 0, + keep_sentence_number: int = 0, + high_priority_bonus: int = 100, + context_budget: str = "+100", + token_budget_ratio: float = 1.4, + condition_in_question: str = "none", + reorder_context: str = "original", + dynamic_context_compression_ratio: float = 0.0, + condition_compare: bool = False, + add_instruction: bool = 
False, + rank_method: str = "llmlingua", + concate_question: bool = True, + ): + """ + Compresses the given prompt context based on a specified structure. + + Each element of context should be segmented using one or more non-nested '' tags. Each '' tag + can include optional parameters 'ratio' and 'compress' (e.g., ''), + indicating the compression ratio for that segment. Default values are 'ratio=2.0' and 'compress=True'. + When 'compress' is set to False, it overrides the 'ratio' parameter, resulting in no compression for that segment. + + Args: + context (List[str]): List of context strings divided by '' tags with optional compression settings. + instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. + question (str, optional): A specific question that the prompt is addressing. Default is an empty string. + ratio (float, optional): The minimum compression ratio target to be achieved. The compression ratio is defined + the same as in Wikipedia [Data compression ratio](https://en.wikipedia.org/wiki/Data_compression_ratio): + .. math::\text{Compression Ratio} = \frac{\text{Uncompressed Size}}{\text{Compressed Size}} + Default is 2.0. The actual compression ratio generally exceeds the specified target, but there can be + fluctuations due to differences in tokenizers. If specified, it should be a float greater than or equal + to 1.0, representing the target compression ratio. + target_token (float, optional): The maximum number of tokens to be achieved. Default is -1, indicating no specific target. + The actual number of tokens after compression should generally be less than the specified target_token, but there can + be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as + the sole criterion, overriding the ``ratio``. + iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. + force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. + force_context_number (int, optional): The number of context sections to forcibly include. Default is None. + use_sentence_level_filter (bool, optional): Whether to apply sentence-level filtering in compression. Default is False. + use_context_level_filter (bool, optional): Whether to apply context-level filtering in compression. Default is True. + use_token_level_filter (bool, optional): Whether to apply token-level filtering in compression. Default is True. + keep_split (bool, optional): Whether to preserve the original separators without compression. Default is False. + keep_first_sentence (int, optional): Number of sentences to forcibly preserve from the start of the context. Default is 0. + keep_last_sentence (int, optional): Number of sentences to forcibly preserve from the end of the context. Default is 0. + keep_sentence_number (int, optional): Total number of sentences to forcibly preserve in the compression. Default is 0. + high_priority_bonus (int, optional): Bonus score for high-priority sentences to influence their likelihood of being retained. Default is 100. + context_budget (str, optional): Token budget for the context-level filtering, expressed as a string to indicate flexibility. Default is "+100". + token_budget_ratio (float, optional): Ratio to adjust token budget during sentence-level filtering. Default is 1.4. 
+ condition_in_question (str, optional): Specific condition to apply to question in the context. Default is "none". + reorder_context (str, optional): Strategy for reordering context in the compressed result. Default is "original". + dynamic_context_compression_ratio (float, optional): Ratio for dynamically adjusting context compression. Default is 0.0. + condition_compare (bool, optional): Whether to enable condition comparison during token-level compression. Default is False. + add_instruction (bool, optional): Whether to add the instruction to the prompt prefix. Default is False. + rank_method (str, optional): Method used for ranking elements during compression. Default is "llmlingua". + concate_question (bool, optional): Whether to concatenate the question to the compressed prompt. Default is True. + + Returns: + dict: A dictionary containing: + - "compressed_prompt" (str): The resulting compressed prompt. + - "origin_tokens" (int): The original number of tokens in the input. + - "compressed_tokens" (int): The number of tokens in the compressed output. + - "ratio" (str): The compression ratio achieved, in a human-readable format. + - "rate" (str): The compression rate achieved, calculated as the token number after compression divided by the original token number. + - "saving" (str): Estimated savings in GPT-4 token usage. + """ + if not context: + context = [" "] + if isinstance(context, str): + context = [context] + context, context_segs, context_segs_ratio, context_segs_compress = self.segment_structured_context(context) + + assert not ( + rank_method == "longllmlingua" and not question + ), "In the LongLLMLingua, it is necessary to set a question." + if condition_compare and "_condition" not in condition_in_question: + condition_in_question += "_condition" + if rank_method == "longllmlingua": + if condition_in_question == "none": + condition_in_question = "after" + elif rank_method == "llmlingua": + condition_in_question = ( + "none" + if "_condition" not in condition_in_question + else "none_condition" + ) + origin_tokens = len( + encoding.encode("\n\n".join([instruction] + context + [question]).strip()) + ) + context_tokens_length = [self.get_token_length(c) for c in context] + instruction_tokens_length, question_tokens_length = self.get_token_length( + instruction + ), self.get_token_length(question) + if target_token == -1: + target_token = ( + ( + instruction_tokens_length + + question_tokens_length + + sum(context_tokens_length) + ) + * (1 / ratio) + - instruction_tokens_length + - (question_tokens_length if concate_question else 0) + ) + + segment_comprehensive_rate = ( + sum( + sum( + [ + self.get_token_length(seg_text) / seg_ratio + for seg_text, seg_ratio, _ in zip( + context_segs[context_idx], + context_segs_ratio[context_idx], + context_segs_compress[context_idx] + ) + ] + ) + for context_idx in range(len(context)) + ) / self.get_token_length("\n\n".join(context)) + ) + global_compression_rate = target_token / self.get_token_length("\n\n".join(context)) + + assert abs(segment_comprehensive_rate - global_compression_rate) < 0.1, \ + f"The comprehensive compression rate of each segment, {segment_comprehensive_rate}, does not match the target compression ratio, {global_compression_rate}." 
+ + condition_flag = "_condition" in condition_in_question + condition_in_question = condition_in_question.replace("_condition", "") + + if len(context) > 1 and use_context_level_filter: + context, dynamic_ratio, context_used = self.control_context_budget( + context, + context_tokens_length, + target_token, + force_context_ids, + force_context_number, + question, + condition_in_question, + reorder_context=reorder_context, + dynamic_context_compression_ratio=dynamic_context_compression_ratio, + rank_method=rank_method, + context_budget=context_budget, + ) + context_segs = [context_segs[idx] for idx in context_used] + context_segs_ratio = [context_segs_ratio[idx] for idx in context_used] + context_segs_compress = [context_segs_compress[idx] for idx in context_used] + else: + dynamic_ratio = [0.0] * len(context) + + if use_sentence_level_filter: + context, segments_info = self.control_sentence_budget( + context, + target_token, + keep_first_sentence=keep_first_sentence, + keep_last_sentence=keep_last_sentence, + keep_sentence_number=keep_sentence_number, + high_priority_bonus=high_priority_bonus, + token_budget_ratio=token_budget_ratio, + question=question, + condition_in_question=condition_in_question, + rank_method=rank_method, + context_segs=context_segs, + context_segs_ratio=context_segs_ratio, + context_segs_compress=context_segs_compress, + ) + else: + segments_info = [] + for context_idx in range(len(context)): + segments_info.append([(len(seg_text), seg_ratio, seg_compress) for seg_text, seg_ratio, seg_compress in zip(context_segs[context_idx], context_segs_ratio[context_idx], context_segs_compress[context_idx])]) + segments_info = [self.concate_segment_info(segment_info) for segment_info in segments_info] + + if condition_flag: + prefix = question + "\n\n" + instruction if add_instruction else question + if ( + self.get_token_length(prefix) + 2 + iterative_size * 2 + > self.max_position_embeddings + ): + tokens = self.tokenizer(prefix, add_special_tokens=False).input_ids + prefix = self.tokenizer.decode( + tokens[: self.prefix_bos_num] + + tokens[ + len(tokens) + - self.max_position_embeddings + + 2 + + self.prefix_bos_num + + 2 * iterative_size : + ] + ) + start = self.get_token_length(prefix) + 2 + context = [prefix] + context + else: + start = 0 + + if use_token_level_filter: + context = self.iterative_compress_prompt( + context, + target_token, + iterative_size=iterative_size, + keep_split=keep_split, + start=start, + dynamic_ratio=dynamic_ratio, + condition_compare=condition_compare, + segments_info=segments_info, + ) + compressed_prompt = ( + self.tokenizer.batch_decode(context[0])[0] + .replace(" ", "") + .replace("", "") + ) + else: + if condition_flag: + context = context[1:] + compressed_prompt = "\n\n".join(context) + + res = [] + if instruction: + res.append(instruction) + if compressed_prompt.strip(): + res.append(compressed_prompt) + if question and concate_question: + res.append(question) + + compressed_prompt = "\n\n".join(res) + + compressed_tokens = len(encoding.encode(compressed_prompt)) + saving = (origin_tokens - compressed_tokens) * 0.06 / 1000 + ratio = 1 if compressed_tokens == 0 else origin_tokens / compressed_tokens + rate = 1 / ratio + return { + "compressed_prompt": compressed_prompt, + "origin_tokens": origin_tokens, + "compressed_tokens": compressed_tokens, + "ratio": f"{ratio:.1f}x", + "rate": f"{rate * 100:.1f}%", + "saving": f", Saving ${saving:.1f} in GPT-4.", + } + + def segment_structured_context( + self, + context: List[str], + ): + new_context, 
context_segs, context_segs_ratio, context_segs_compress = [], [], [], [] + for text in context: + if not text.startswith(""): + text = text + "" + + # Regular expression to match content, allowing ratio and compress in any order + pattern = r"([^<]+)" + matches = re.findall(pattern, text) + + # Extracting segment contents + segments = [match[4] for match in matches] + + # Extracting ratio and compress, considering their possible positions + segs_ratio = [float(match[0]) if match[0] else (float(match[2]) if match[2] else None) for match in matches] + segs_compress = [(match[1] == 'True' if match[1] else (match[3] == 'True' if match[3] else None)) for match in matches] + + segs_compress = [compress if compress is not None else True for compress in segs_compress] + segs_ratio = [ratio if ratio else (2.0 if compress else 1.0) for ratio, compress in zip(segs_ratio, segs_compress)] + assert len(segments) == len(segs_ratio) == len(segs_compress), "The number of segments, ratios, and compress flags should be the same." + + new_context.append("".join(segments)) + context_segs.append(segments) + context_segs_ratio.append(segs_ratio) + context_segs_compress.append(segs_compress) + + + return new_context, context_segs, context_segs_ratio, context_segs_compress + + def concate_segment_info( + self, + segment_info: List[List[tuple]], + ): + new_segment_info = [] + for i, (seg_len, seg_ratio, seg_compress) in enumerate(segment_info): + if new_segment_info and new_segment_info[-1][1] == seg_ratio and new_segment_info[-1][2] == seg_compress: + new_segment_info[-1] = (new_segment_info[-1][0] + seg_len, seg_ratio, seg_compress) + else: + new_segment_info.append((seg_len, seg_ratio, seg_compress)) + return new_segment_info \ No newline at end of file From 8e31e10cf22438e4bd89c129598e4e7b1854a294 Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Wed, 31 Jan 2024 18:20:03 +0000 Subject: [PATCH 03/19] update structured prompt compress for llama --- llmlingua/prompt_compressor.py | 109 ++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index a588515..2f0ce05 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -446,51 +446,90 @@ def get_structured_dynamic_compression_ratio( seg_info: List[List[tuple]] = None, ): if start: - context_length = context_length[1:] + context = context[1:] global_dynamic_rate, global_dynamic_compress, tmp_context = [], [], [] for context_idx, text in enumerate(context): text_seen = 0 - new_text = [] for seg_idx, (seg_len, seg_ratio, seg_compress) in enumerate(seg_info[context_idx]): - new_text.append(text[text_seen:text_seen + seg_len]) + seg_text = text[text_seen : text_seen + seg_len] + if seg_idx == len(seg_info[context_idx]) - 1 and context_idx != len(context) - 1: + seg_text += '\n\n' + tmp_context.append(seg_text) if seg_compress: global_dynamic_rate.append(1 / seg_ratio) else: global_dynamic_rate.append(1.0) global_dynamic_compress.append(seg_compress) text_seen += seg_len - tmp_context.append((" " + self.tokenizer.bos_token).join(new_text)) - tmp_context = ("\n\n " + self.tokenizer.bos_token).join(tmp_context) - context_input_ids = self.tokenizer(tmp_context).input_ids - - assert context_input_ids.count(1) == len(global_dynamic_rate) - indexes_of_seperator = [i for i, v in enumerate(context_input_ids) if v == 1][1:] + [len(context_input_ids)] - - res, idx, token_seen, last, last_target = [], 0, 0, 0, [] - while idx < len(indexes_of_seperator): 
- if indexes_of_seperator[idx] - token_seen > iterative_size - last: - last_target.append( - (iterative_size - last, global_dynamic_rate[idx]) - ) - res.append(last_target) - token_seen += iterative_size - last - last, last_target = 0, [] - k = (indexes_of_seperator[idx] - token_seen) // iterative_size - res.extend( - [[(iterative_size, global_dynamic_rate[idx])]] * k - ) - token_seen += k * iterative_size - - if indexes_of_seperator[idx] - token_seen: - last_target.append( - (indexes_of_seperator[idx] - token_seen, global_dynamic_rate[idx]) - ) - last += indexes_of_seperator[idx] - token_seen - token_seen = indexes_of_seperator[idx] + 1 - idx += 1 - if last_target: - res.append(last_target) - return res + origin_text = '\n\n'.join(context) + assert len("".join(tmp_context)) == len(origin_text) + dynamic_compression_ratio = self.token_segment(origin_text, iterative_size, tmp_context, global_dynamic_rate, global_dynamic_compress) + return dynamic_compression_ratio + + def token_segment(self, text, iterative_size, segments, global_dynamic_rate, global_dynamic_compress): + assert len(segments) == len(global_dynamic_rate) == len(global_dynamic_compress) + context_input_ids = self.tokenizer(text).input_ids + segments_inputs_ids = self.tokenizer((" " + self.tokenizer.bos_token).join(segments)).input_ids + + segments_token_len = [0] + seg_token_len = 0 + for i, token_id in enumerate(segments_inputs_ids): + token = self.tokenizer.convert_ids_to_tokens(token_id) + if (token_id == self.tokenizer.bos_token_id) and i: + segments_token_len.append(seg_token_len + segments_token_len[-1]) + seg_token_len = 0 + continue + seg_token_len += len(token) + if seg_token_len: + segments_token_len.append(seg_token_len + segments_token_len[-1]) + + assert segments_token_len[-1] == sum([len(self.tokenizer.convert_ids_to_tokens(id)) for id in context_input_ids]) + + + token_seen, segment_id, origin_len = 0, 0, 0 + dynamic_compression_ratio, local_compresssion_ratio = [], [] + + for i, token_id in enumerate(context_input_ids): + token_len = len(self.tokenizer.convert_ids_to_tokens(token_id)) + if origin_len + token_len > segments_token_len[segment_id + 1]: + last_ratio = global_dynamic_rate[segment_id] + possible_ratio, possible_compress = [], [] + while origin_len + token_len > segments_token_len[segment_id + 1]: + possible_ratio.append(global_dynamic_rate[segment_id]) + possible_compress.append(global_dynamic_compress[segment_id]) + segment_id += 1 + possible_ratio.append(global_dynamic_rate[segment_id]) + possible_compress.append(global_dynamic_compress[segment_id]) + if False in possible_compress: + new_ratio = 1.0 + else: + new_ratio = min(possible_ratio) + if new_ratio == last_ratio: + local_compresssion_ratio.append((i - token_seen + 1, last_ratio)) + token_seen = i + 1 + elif new_ratio != global_dynamic_rate[segment_id]: + local_compresssion_ratio.append((i - token_seen, last_ratio)) + local_compresssion_ratio.append((1, new_ratio)) + else: + local_compresssion_ratio.append((i - token_seen, last_ratio)) + token_seen = i + if (i + 1) % iterative_size == 0: + if token_seen != i + 1: + local_compresssion_ratio.append((i - token_seen + 1, global_dynamic_rate[segment_id])) + dynamic_compression_ratio.append(local_compresssion_ratio[:]) + token_seen = i + 1 + local_compresssion_ratio = [] + if origin_len + token_len == segments_token_len[segment_id + 1]: + if token_seen != i + 1: + local_compresssion_ratio.append((i - token_seen + 1, global_dynamic_rate[segment_id])) + token_seen = i + 1 + segment_id += 1 + 
origin_len += token_len + if local_compresssion_ratio: + dynamic_compression_ratio.append(local_compresssion_ratio) + + assert len(context_input_ids) == sum([sum([x[0] for x in l]) for l in dynamic_compression_ratio]) + return dynamic_compression_ratio def control_context_budget( self, From 992df86c92ef259be4441c3d05e84d60492fdb4e Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Mon, 5 Feb 2024 06:16:49 +0000 Subject: [PATCH 04/19] update structured prompt compress for llama and gpt2 --- llmlingua/prompt_compressor.py | 106 +++++++++++++-------------------- 1 file changed, 43 insertions(+), 63 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 2f0ce05..c24cf30 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -468,68 +468,48 @@ def get_structured_dynamic_compression_ratio( def token_segment(self, text, iterative_size, segments, global_dynamic_rate, global_dynamic_compress): assert len(segments) == len(global_dynamic_rate) == len(global_dynamic_compress) - context_input_ids = self.tokenizer(text).input_ids - segments_inputs_ids = self.tokenizer((" " + self.tokenizer.bos_token).join(segments)).input_ids - - segments_token_len = [0] - seg_token_len = 0 - for i, token_id in enumerate(segments_inputs_ids): - token = self.tokenizer.convert_ids_to_tokens(token_id) - if (token_id == self.tokenizer.bos_token_id) and i: - segments_token_len.append(seg_token_len + segments_token_len[-1]) - seg_token_len = 0 - continue - seg_token_len += len(token) - if seg_token_len: - segments_token_len.append(seg_token_len + segments_token_len[-1]) - - assert segments_token_len[-1] == sum([len(self.tokenizer.convert_ids_to_tokens(id)) for id in context_input_ids]) - - - token_seen, segment_id, origin_len = 0, 0, 0 - dynamic_compression_ratio, local_compresssion_ratio = [], [] - - for i, token_id in enumerate(context_input_ids): - token_len = len(self.tokenizer.convert_ids_to_tokens(token_id)) - if origin_len + token_len > segments_token_len[segment_id + 1]: - last_ratio = global_dynamic_rate[segment_id] - possible_ratio, possible_compress = [], [] - while origin_len + token_len > segments_token_len[segment_id + 1]: - possible_ratio.append(global_dynamic_rate[segment_id]) - possible_compress.append(global_dynamic_compress[segment_id]) - segment_id += 1 - possible_ratio.append(global_dynamic_rate[segment_id]) - possible_compress.append(global_dynamic_compress[segment_id]) - if False in possible_compress: - new_ratio = 1.0 - else: - new_ratio = min(possible_ratio) - if new_ratio == last_ratio: - local_compresssion_ratio.append((i - token_seen + 1, last_ratio)) - token_seen = i + 1 - elif new_ratio != global_dynamic_rate[segment_id]: - local_compresssion_ratio.append((i - token_seen, last_ratio)) - local_compresssion_ratio.append((1, new_ratio)) - else: - local_compresssion_ratio.append((i - token_seen, last_ratio)) - token_seen = i + assert text == "".join(segments) + text_input_ids = self.tokenizer(text, add_special_tokens=False).input_ids + decode_window = 3 + seg_idx, seg_seen, token_seen_num, last_rate = 0, 0, 0, -1 + dynamic_compression_rate, local_compresssion_rate = [], [] + for i in range(len(text_input_ids)): + if i < decode_window: + id_pre, id_cur = text_input_ids[: i], text_input_ids[: i + 1] + else: + id_pre, id_cur = text_input_ids[i - decode_window + 1: i], text_input_ids[i - decode_window + 1: i + 1] + cur_word = self.tokenizer.decode(id_cur)[len(self.tokenizer.decode(id_pre)):] + cur_word_len = len(cur_word) + if cur_word_len 
and cur_word_len >= len(segments[seg_idx]) - seg_seen: + possible_rate, possible_compress = [], [] + while cur_word_len and cur_word_len >= len(segments[seg_idx]) - seg_seen: + possible_rate.append(global_dynamic_rate[seg_idx]) + possible_compress.append(global_dynamic_compress[seg_idx]) + cur_word_len -= len(segments[seg_idx]) - seg_seen + seg_idx += 1 + seg_seen = 0 + if cur_word_len: + possible_rate.append(global_dynamic_rate[seg_idx]) + possible_compress.append(global_dynamic_compress[seg_idx]) + new_rate = 1.0 if False in possible_compress else min(possible_rate) + else: + new_rate = global_dynamic_rate[seg_idx] + if new_rate != last_rate and i - token_seen_num: + local_compresssion_rate.append((i - token_seen_num, last_rate)) + token_seen_num = i + last_rate = new_rate + seg_seen += cur_word_len if (i + 1) % iterative_size == 0: - if token_seen != i + 1: - local_compresssion_ratio.append((i - token_seen + 1, global_dynamic_rate[segment_id])) - dynamic_compression_ratio.append(local_compresssion_ratio[:]) - token_seen = i + 1 - local_compresssion_ratio = [] - if origin_len + token_len == segments_token_len[segment_id + 1]: - if token_seen != i + 1: - local_compresssion_ratio.append((i - token_seen + 1, global_dynamic_rate[segment_id])) - token_seen = i + 1 - segment_id += 1 - origin_len += token_len - if local_compresssion_ratio: - dynamic_compression_ratio.append(local_compresssion_ratio) - - assert len(context_input_ids) == sum([sum([x[0] for x in l]) for l in dynamic_compression_ratio]) - return dynamic_compression_ratio + if token_seen_num != i + 1: + local_compresssion_rate.append((i + 1 - token_seen_num, last_rate)) + token_seen_num = i + 1 + dynamic_compression_rate.append(local_compresssion_rate[:]) + local_compresssion_rate = [] + if token_seen_num != len(text_input_ids): + local_compresssion_rate.append((len(text_input_ids) - token_seen_num, last_rate)) + if local_compresssion_rate != []: + dynamic_compression_rate.append(local_compresssion_rate[:]) + return dynamic_compression_rate def control_context_budget( self, @@ -889,7 +869,7 @@ def iterative_compress_prompt( context, iterative_size, dynamic_ratio, start, segments_info ) context = "\n\n".join(context) - tokenized_text = self.tokenizer(context, return_tensors="pt") + tokenized_text = self.tokenizer(context, return_tensors="pt", add_special_tokens=False) input_ids = tokenized_text["input_ids"].to(self.device) attention_mask = tokenized_text["attention_mask"].to(self.device) @@ -1053,7 +1033,7 @@ def iterative_compress_prompt( else: threshold = self.get_estimate_threshold_base_distribution( loss, ratio, False - ) + ) ( compressed_input_ids, From 81acc9cb2db40541d2436b5578c7b43da4135c9b Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Mon, 5 Feb 2024 06:27:29 +0000 Subject: [PATCH 05/19] add variable type declaration --- llmlingua/prompt_compressor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index c24cf30..951c12d 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -466,7 +466,14 @@ def get_structured_dynamic_compression_ratio( dynamic_compression_ratio = self.token_segment(origin_text, iterative_size, tmp_context, global_dynamic_rate, global_dynamic_compress) return dynamic_compression_ratio - def token_segment(self, text, iterative_size, segments, global_dynamic_rate, global_dynamic_compress): + def token_segment( + self, + text: str, + iterative_size: int, + segments: List[str], + 
global_dynamic_rate: List[float], + global_dynamic_compress: List[bool], + ): assert len(segments) == len(global_dynamic_rate) == len(global_dynamic_compress) assert text == "".join(segments) text_input_ids = self.tokenizer(text, add_special_tokens=False).input_ids From 46343ca8f51e3b253780a42f885196668d471531 Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Mon, 5 Feb 2024 08:56:41 +0000 Subject: [PATCH 06/19] merge part of structured_compress_prompt into compress_prompt --- llmlingua/prompt_compressor.py | 180 +++++++++------------------------ 1 file changed, 50 insertions(+), 130 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 951c12d..eb24e77 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -181,6 +181,9 @@ def compress_prompt( add_instruction: bool = False, rank_method: str = "llmlingua", concate_question: bool = True, + context_segs: List[str] = None, + context_segs_ratio: List[float] = None, + context_segs_compress: List[bool] = None, ): """ Compresses the given context. @@ -269,7 +272,7 @@ def compress_prompt( condition_in_question = condition_in_question.replace("_condition", "") if len(context) > 1 and use_context_level_filter: - context, dynamic_ratio, _ = self.control_context_budget( + context, dynamic_ratio, context_used = self.control_context_budget( context, context_tokens_length, target_token, @@ -282,11 +285,15 @@ def compress_prompt( rank_method=rank_method, context_budget=context_budget, ) + if context_segs is not None: + context_segs = [context_segs[idx] for idx in context_used] + context_segs_ratio = [context_segs_ratio[idx] for idx in context_used] + context_segs_compress = [context_segs_compress[idx] for idx in context_used] else: dynamic_ratio = [0.0] * len(context) if use_sentence_level_filter: - context, _ = self.control_sentence_budget( + context, segments_info = self.control_sentence_budget( context, target_token, keep_first_sentence=keep_first_sentence, @@ -297,7 +304,17 @@ def compress_prompt( question=question, condition_in_question=condition_in_question, rank_method=rank_method, + context_segs=context_segs, + context_segs_ratio=context_segs_ratio, + context_segs_compress=context_segs_compress, ) + elif context_segs is not None: + segments_info = [] + for context_idx in range(len(context)): + segments_info.append([(len(seg_text), seg_ratio, seg_compress) for seg_text, seg_ratio, seg_compress in zip(context_segs[context_idx], context_segs_ratio[context_idx], context_segs_compress[context_idx])]) + segments_info = [self.concate_segment_info(segment_info) for segment_info in segments_info] + else: + segments_info = None if condition_flag: prefix = question + "\n\n" + instruction if add_instruction else question @@ -330,6 +347,7 @@ def compress_prompt( start=start, dynamic_ratio=dynamic_ratio, condition_compare=condition_compare, + segments_info=segments_info, ) compressed_prompt = ( self.tokenizer.batch_decode(context[0])[0] @@ -1485,23 +1503,6 @@ def structured_compress_prompt( context = [context] context, context_segs, context_segs_ratio, context_segs_compress = self.segment_structured_context(context) - assert not ( - rank_method == "longllmlingua" and not question - ), "In the LongLLMLingua, it is necessary to set a question." 
- if condition_compare and "_condition" not in condition_in_question: - condition_in_question += "_condition" - if rank_method == "longllmlingua": - if condition_in_question == "none": - condition_in_question = "after" - elif rank_method == "llmlingua": - condition_in_question = ( - "none" - if "_condition" not in condition_in_question - else "none_condition" - ) - origin_tokens = len( - encoding.encode("\n\n".join([instruction] + context + [question]).strip()) - ) context_tokens_length = [self.get_token_length(c) for c in context] instruction_tokens_length, question_tokens_length = self.get_token_length( instruction @@ -1517,7 +1518,6 @@ def structured_compress_prompt( - instruction_tokens_length - (question_tokens_length if concate_question else 0) ) - segment_comprehensive_rate = ( sum( sum( @@ -1538,117 +1538,37 @@ def structured_compress_prompt( assert abs(segment_comprehensive_rate - global_compression_rate) < 0.1, \ f"The comprehensive compression rate of each segment, {segment_comprehensive_rate}, does not match the target compression ratio, {global_compression_rate}." - condition_flag = "_condition" in condition_in_question - condition_in_question = condition_in_question.replace("_condition", "") - - if len(context) > 1 and use_context_level_filter: - context, dynamic_ratio, context_used = self.control_context_budget( - context, - context_tokens_length, - target_token, - force_context_ids, - force_context_number, - question, - condition_in_question, - reorder_context=reorder_context, - dynamic_context_compression_ratio=dynamic_context_compression_ratio, - rank_method=rank_method, - context_budget=context_budget, - ) - context_segs = [context_segs[idx] for idx in context_used] - context_segs_ratio = [context_segs_ratio[idx] for idx in context_used] - context_segs_compress = [context_segs_compress[idx] for idx in context_used] - else: - dynamic_ratio = [0.0] * len(context) - - if use_sentence_level_filter: - context, segments_info = self.control_sentence_budget( - context, - target_token, - keep_first_sentence=keep_first_sentence, - keep_last_sentence=keep_last_sentence, - keep_sentence_number=keep_sentence_number, - high_priority_bonus=high_priority_bonus, - token_budget_ratio=token_budget_ratio, - question=question, - condition_in_question=condition_in_question, - rank_method=rank_method, - context_segs=context_segs, - context_segs_ratio=context_segs_ratio, - context_segs_compress=context_segs_compress, - ) - else: - segments_info = [] - for context_idx in range(len(context)): - segments_info.append([(len(seg_text), seg_ratio, seg_compress) for seg_text, seg_ratio, seg_compress in zip(context_segs[context_idx], context_segs_ratio[context_idx], context_segs_compress[context_idx])]) - segments_info = [self.concate_segment_info(segment_info) for segment_info in segments_info] + return self.compress_prompt( + context, + instruction, + question, + ratio, + target_token, + iterative_size, + force_context_ids, + force_context_number, + use_sentence_level_filter, + use_context_level_filter, + use_token_level_filter, + keep_split, + keep_first_sentence, + keep_last_sentence, + keep_sentence_number, + high_priority_bonus, + context_budget, + token_budget_ratio, + condition_in_question, + reorder_context, + dynamic_context_compression_ratio, + condition_compare, + add_instruction, + rank_method, + concate_question, + context_segs=context_segs, + context_segs_ratio=context_segs_ratio, + context_segs_compress=context_segs_compress, + ) - if condition_flag: - prefix = question + "\n\n" + 
instruction if add_instruction else question - if ( - self.get_token_length(prefix) + 2 + iterative_size * 2 - > self.max_position_embeddings - ): - tokens = self.tokenizer(prefix, add_special_tokens=False).input_ids - prefix = self.tokenizer.decode( - tokens[: self.prefix_bos_num] - + tokens[ - len(tokens) - - self.max_position_embeddings - + 2 - + self.prefix_bos_num - + 2 * iterative_size : - ] - ) - start = self.get_token_length(prefix) + 2 - context = [prefix] + context - else: - start = 0 - - if use_token_level_filter: - context = self.iterative_compress_prompt( - context, - target_token, - iterative_size=iterative_size, - keep_split=keep_split, - start=start, - dynamic_ratio=dynamic_ratio, - condition_compare=condition_compare, - segments_info=segments_info, - ) - compressed_prompt = ( - self.tokenizer.batch_decode(context[0])[0] - .replace(" ", "") - .replace("", "") - ) - else: - if condition_flag: - context = context[1:] - compressed_prompt = "\n\n".join(context) - - res = [] - if instruction: - res.append(instruction) - if compressed_prompt.strip(): - res.append(compressed_prompt) - if question and concate_question: - res.append(question) - - compressed_prompt = "\n\n".join(res) - - compressed_tokens = len(encoding.encode(compressed_prompt)) - saving = (origin_tokens - compressed_tokens) * 0.06 / 1000 - ratio = 1 if compressed_tokens == 0 else origin_tokens / compressed_tokens - rate = 1 / ratio - return { - "compressed_prompt": compressed_prompt, - "origin_tokens": origin_tokens, - "compressed_tokens": compressed_tokens, - "ratio": f"{ratio:.1f}x", - "rate": f"{rate * 100:.1f}%", - "saving": f", Saving ${saving:.1f} in GPT-4.", - } - def segment_structured_context( self, context: List[str], From 85f2fda67aefe3a044feccc5802055ca51a9e227 Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Wed, 7 Feb 2024 07:33:18 +0000 Subject: [PATCH 07/19] add no-compression retention to sentence and context level filter --- llmlingua/prompt_compressor.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index eb24e77..61f4a6f 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -284,6 +284,9 @@ def compress_prompt( dynamic_context_compression_ratio=dynamic_context_compression_ratio, rank_method=rank_method, context_budget=context_budget, + context_segs=context_segs, + context_segs_ratio=context_segs_ratio, + context_segs_compress=context_segs_compress, ) if context_segs is not None: context_segs = [context_segs[idx] for idx in context_used] @@ -549,9 +552,10 @@ def control_context_budget( dynamic_context_compression_ratio: float = 0.0, rank_method: str = "longllmlingua", context_budget: str = "+100", + context_segs: List[List[str]] = None, + context_segs_ratio: List[List[float]] = None, + context_segs_compress: List[List[bool]] = None, ): - if force_context_ids is not None: - return [context[ii] for ii in force_context_ids], [0] * len(force_context_ids) demostrations_sort = self.get_rank_results( context, question, @@ -565,6 +569,10 @@ def control_context_budget( target_token = eval("target_token" + context_budget) res = [] used = force_context_ids if force_context_ids is not None else [] + if context_segs is not None: + for idx, _ in enumerate(context): + if False in context_segs_compress[idx]: + used.append(idx) self.context_idxs.append([x for idx, (x, _) in enumerate(demostrations_sort)]) for idx, _ in demostrations_sort: @@ -733,6 +741,13 @@ def 
sync_sentence(segments, text): sentence_flags[idx] = True if target_token < 0: break + + if context_segs is not None: + for idx in range(N): + preserved = [sen_seg_info[2] for sen_seg_info in sen2seg_ratio[idx]] + if False in preserved: + sentence_flags[idx] = True + idx = 0 res = [] new_segments_info = [] From 2917e5a36a08ce7d3f4200cecb8aa00ab67686d8 Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Wed, 7 Feb 2024 08:02:54 +0000 Subject: [PATCH 08/19] update global ratio and global target token --- llmlingua/prompt_compressor.py | 66 +++++++++++++++------------------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 61f4a6f..8f75058 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -1437,8 +1437,8 @@ def structured_compress_prompt( context: List[str], instruction: str = "", question: str = "", - ratio: float = 2.0, - target_token: float = -1, + global_ratio: float = 2.0, + global_target_token: float = -1, iterative_size: int = 200, force_context_ids: List[int] = None, force_context_number: int = None, @@ -1465,23 +1465,30 @@ def structured_compress_prompt( Each element of context should be segmented using one or more non-nested '' tags. Each '' tag can include optional parameters 'ratio' and 'compress' (e.g., ''), - indicating the compression ratio for that segment. Default values are 'ratio=2.0' and 'compress=True'. + indicating the compression ratio for that segment. Default values are 'ratio=global_ratio' and 'compress=True'. When 'compress' is set to False, it overrides the 'ratio' parameter, resulting in no compression for that segment. Args: context (List[str]): List of context strings divided by '' tags with optional compression settings. instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. question (str, optional): A specific question that the prompt is addressing. Default is an empty string. - ratio (float, optional): The minimum compression ratio target to be achieved. The compression ratio is defined - the same as in Wikipedia [Data compression ratio](https://en.wikipedia.org/wiki/Data_compression_ratio): + global_ratio (float, optional): The compression ratio is defined the same as in Wikipedia [Data compression ratio] + (https://en.wikipedia.org/wiki/Data_compression_ratio): .. math::\text{Compression Ratio} = \frac{\text{Uncompressed Size}}{\text{Compressed Size}} Default is 2.0. The actual compression ratio generally exceeds the specified target, but there can be fluctuations due to differences in tokenizers. If specified, it should be a float greater than or equal - to 1.0, representing the target compression ratio. - target_token (float, optional): The maximum number of tokens to be achieved. Default is -1, indicating no specific target. - The actual number of tokens after compression should generally be less than the specified target_token, but there can - be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as - the sole criterion, overriding the ``ratio``. + to 1.0, representing the target compression ratio. ``global_ratio``, is applicable only within the context-level filter + and the sentence-level filter. In the token-level filter, the ratio for each segment overrides the global ratio. + However, for segments where no specific ratio is defined, the global ratio serves as the default value. 
The final + compression ratio of the entire text is a composite result of multiple compression ratios applied across different sections. + global_target_token (float, optional): The global maximum number of tokens to be achieved. Default is -1, indicating no + specific target. The actual number of tokens after compression should generally be less than the specified target_token, + but there can be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as + the sole criterion, overriding the ``global_ratio``. ``global_target_token``, is applicable only within the context-level + filter and the sentence-level filter. In the token-level filter, the ratio for each segment overrides the global target token. + However, for segments where no specific ratio is defined, the global ratio calculated from global target token serves + as the default value. The final target token of the entire text is a composite result of multiple compression ratios + applied across different sections. iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. force_context_number (int, optional): The number of context sections to forcibly include. Default is None. @@ -1516,49 +1523,32 @@ def structured_compress_prompt( context = [" "] if isinstance(context, str): context = [context] - context, context_segs, context_segs_ratio, context_segs_compress = self.segment_structured_context(context) - + context_tokens_length = [self.get_token_length(c) for c in context] instruction_tokens_length, question_tokens_length = self.get_token_length( instruction ), self.get_token_length(question) - if target_token == -1: - target_token = ( + if global_target_token == -1: + global_target_token = ( ( instruction_tokens_length + question_tokens_length + sum(context_tokens_length) ) - * (1 / ratio) + * (1 / global_ratio) - instruction_tokens_length - (question_tokens_length if concate_question else 0) ) - segment_comprehensive_rate = ( - sum( - sum( - [ - self.get_token_length(seg_text) / seg_ratio - for seg_text, seg_ratio, _ in zip( - context_segs[context_idx], - context_segs_ratio[context_idx], - context_segs_compress[context_idx] - ) - ] - ) - for context_idx in range(len(context)) - ) / self.get_token_length("\n\n".join(context)) - ) - global_compression_rate = target_token / self.get_token_length("\n\n".join(context)) - - assert abs(segment_comprehensive_rate - global_compression_rate) < 0.1, \ - f"The comprehensive compression rate of each segment, {segment_comprehensive_rate}, does not match the target compression ratio, {global_compression_rate}." 
+ else: + global_ratio = global_target_token / sum(context_tokens_length) + context, context_segs, context_segs_ratio, context_segs_compress = self.segment_structured_context(context, global_ratio) return self.compress_prompt( context, instruction, question, - ratio, - target_token, + global_ratio, + global_target_token, iterative_size, force_context_ids, force_context_number, @@ -1587,6 +1577,7 @@ def structured_compress_prompt( def segment_structured_context( self, context: List[str], + global_ratio: float, ): new_context, context_segs, context_segs_ratio, context_segs_compress = [], [], [], [] for text in context: @@ -1607,7 +1598,7 @@ def segment_structured_context( segs_compress = [(match[1] == 'True' if match[1] else (match[3] == 'True' if match[3] else None)) for match in matches] segs_compress = [compress if compress is not None else True for compress in segs_compress] - segs_ratio = [ratio if ratio else (2.0 if compress else 1.0) for ratio, compress in zip(segs_ratio, segs_compress)] + segs_ratio = [ratio if ratio else (global_ratio if compress else 1.0) for ratio, compress in zip(segs_ratio, segs_compress)] assert len(segments) == len(segs_ratio) == len(segs_compress), "The number of segments, ratios, and compress flags should be the same." new_context.append("".join(segments)) @@ -1615,7 +1606,6 @@ def segment_structured_context( context_segs_ratio.append(segs_ratio) context_segs_compress.append(segs_compress) - return new_context, context_segs, context_segs_ratio, context_segs_compress def concate_segment_info( From f60d052581e5b46fe0f319b90e6a08ae069854c9 Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Wed, 7 Feb 2024 08:38:57 +0000 Subject: [PATCH 09/19] fix segments_info error --- llmlingua/prompt_compressor.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 8f75058..0aaf600 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -295,6 +295,7 @@ def compress_prompt( else: dynamic_ratio = [0.0] * len(context) + segments_info = [] if use_sentence_level_filter: context, segments_info = self.control_sentence_budget( context, @@ -312,12 +313,9 @@ def compress_prompt( context_segs_compress=context_segs_compress, ) elif context_segs is not None: - segments_info = [] for context_idx in range(len(context)): segments_info.append([(len(seg_text), seg_ratio, seg_compress) for seg_text, seg_ratio, seg_compress in zip(context_segs[context_idx], context_segs_ratio[context_idx], context_segs_compress[context_idx])]) - segments_info = [self.concate_segment_info(segment_info) for segment_info in segments_info] - else: - segments_info = None + segments_info = [self.concate_segment_info(segment_info) for segment_info in segments_info] if condition_flag: prefix = question + "\n\n" + instruction if add_instruction else question @@ -900,7 +898,7 @@ def iterative_compress_prompt( condition_compare: bool = False, segments_info: List[List[tuple]] = None, ): - if segments_info is None: + if segments_info is None or segments_info == []: iterative_ratios = self.get_dynamic_compression_ratio( context, target_token, iterative_size, dynamic_ratio, start ) From 29be238643ff8dc80d686ccf5f40ff3f887b18ee Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Mon, 19 Feb 2024 10:18:10 +0000 Subject: [PATCH 10/19] change input paramete from ratio to rate --- llmlingua/prompt_compressor.py | 569 ++++++++++++++++++++------------- 1 file changed, 339 insertions(+), 230 deletions(-) diff --git 
a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 0aaf600..724f039 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -2,6 +2,7 @@ # Licensed under The MIT License [see LICENSE for details] import bisect +import re from collections import defaultdict from typing import List @@ -11,7 +12,7 @@ import nltk import tiktoken from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -import re + encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") @@ -20,11 +21,11 @@ class PromptCompressor: """ PromptCompressor is designed for compressing prompts based on a given language model. - This class initializes with the language model and its configuration, preparing it for prompt compression tasks. - The PromptCompressor class is versatile and can be adapted for various models and specific requirements in prompt processing. - Users can specify different model names and configurations as needed for their particular use case.The architecture is - based on the paper "LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models". Jiang, Huiqiang, Qianhui Wu, - Chin-Yew Lin, Yuqing Yang, and Lili Qiu. "Llmlingua: Compressing prompts for accelerated inference of large language models." + This class initializes with the language model and its configuration, preparing it for prompt compression tasks. + The PromptCompressor class is versatile and can be adapted for various models and specific requirements in prompt processing. + Users can specify different model names and configurations as needed for their particular use case. The architecture is + based on the paper "LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models". Jiang, Huiqiang, Qianhui Wu, + Chin-Yew Lin, Yuqing Yang, and Lili Qiu. "Llmlingua: Compressing prompts for accelerated inference of large language models." arXiv preprint arXiv:2310.05736 (2023). Args: @@ -43,6 +44,7 @@ class PromptCompressor: Note: The `PromptCompressor` class requires the Hugging Face Transformers library and an appropriate environment to load and run the models. """ + def __init__( self, model_name: str = "NousResearch/Llama-2-7b-hf", @@ -154,12 +156,156 @@ def get_ppl( def __call__(self, *args, **kwargs): return self.compress_prompt(*args, **kwargs) + def structured_compress_prompt( + self, + context: List[str], + instruction: str = "", + question: str = "", + global_rate: float = 0.5, + global_target_token: float = -1, + iterative_size: int = 200, + force_context_ids: List[int] = None, + force_context_number: int = None, + use_sentence_level_filter: bool = False, + use_context_level_filter: bool = True, + use_token_level_filter: bool = True, + keep_split: bool = False, + keep_first_sentence: int = 0, + keep_last_sentence: int = 0, + keep_sentence_number: int = 0, + high_priority_bonus: int = 100, + context_budget: str = "+100", + token_budget_ratio: float = 1.4, + condition_in_question: str = "none", + reorder_context: str = "original", + dynamic_context_compression_ratio: float = 0.0, + condition_compare: bool = False, + add_instruction: bool = False, + rank_method: str = "llmlingua", + concate_question: bool = True, + ): + """ + Compresses the given prompt context based on a specified structure. + + Each element of context should be segmented using one or more non-nested '<llmlingua></llmlingua>' tags. + Each '<llmlingua></llmlingua>' tag can include optional parameters 'rate' and 'compress' (e.g., '<llmlingua, rate=0.5, compress=True>'),
indicating the compression rate for that segment. Default values are 'rate=global_rate' and 'compress=True'. + When 'compress' is set to False, it overrides the 'rate' parameter, resulting in no compression for that segment. + + Args: + context (List[str]): List of context strings divided by '<llmlingua></llmlingua>' tags with optional compression settings. + instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. + question (str, optional): A specific question that the prompt is addressing. Default is an empty string. + global_rate (float, optional): The compression rate is defined the same as in paper "Language Modeling Is Compression". + Delétang, Grégoire, Anian Ruoss, Paul-Ambroise Duquenne, Elliot Catt, Tim Genewein, Christopher Mattern, + Jordi Grau-Moya et al. "Language modeling is compression." arXiv preprint arXiv:2309.10668 (2023): + .. math::\text{Compression Rate} = \frac{\text{Compressed Size}}{\text{Raw Size}} + Default is 0.5. The actual compression rate is generally lower than the specified target, but there can be + fluctuations due to differences in tokenizers. If specified, it should be a float less than or equal + to 1.0, representing the target compression rate. ``global_rate`` is applicable only within the context-level filter + and the sentence-level filter. In the token-level filter, the rate for each segment overrides the global rate. + However, for segments where no specific rate is defined, the global rate serves as the default value. The final + compression rate of the entire text is a composite result of multiple compression rates applied across different sections. + global_target_token (float, optional): The global maximum number of tokens to be achieved. Default is -1, indicating no + specific target. The actual number of tokens after compression should generally be less than the specified target_token, + but there can be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as + the sole criterion, overriding the ``global_rate``. ``global_target_token`` is applicable only within the context-level + filter and the sentence-level filter. In the token-level filter, the rate for each segment overrides the global target token. + However, for segments where no specific rate is defined, the global rate calculated from the global target token serves + as the default value. The final target token of the entire text is a composite result of multiple compression rates + applied across different sections. + iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. + force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. + force_context_number (int, optional): The number of context sections to forcibly include. Default is None. + use_sentence_level_filter (bool, optional): Whether to apply sentence-level filtering in compression. Default is False. + use_context_level_filter (bool, optional): Whether to apply context-level filtering in compression. Default is True. + use_token_level_filter (bool, optional): Whether to apply token-level filtering in compression. Default is True. + keep_split (bool, optional): Whether to preserve the original separators without compression. Default is False. + keep_first_sentence (int, optional): Number of sentences to forcibly preserve from the start of the context. Default is 0.
+ keep_last_sentence (int, optional): Number of sentences to forcibly preserve from the end of the context. Default is 0. + keep_sentence_number (int, optional): Total number of sentences to forcibly preserve in the compression. Default is 0. + high_priority_bonus (int, optional): Bonus score for high-priority sentences to influence their likelihood of being retained. Default is 100. + context_budget (str, optional): Token budget for the context-level filtering, expressed as a string to indicate flexibility. Default is "+100". + token_budget_ratio (float, optional): Ratio to adjust token budget during sentence-level filtering. Default is 1.4. + condition_in_question (str, optional): Specific condition to apply to question in the context. Default is "none". + reorder_context (str, optional): Strategy for reordering context in the compressed result. Default is "original". + dynamic_context_compression_ratio (float, optional): Ratio for dynamically adjusting context compression. Default is 0.0. + condition_compare (bool, optional): Whether to enable condition comparison during token-level compression. Default is False. + add_instruction (bool, optional): Whether to add the instruction to the prompt prefix. Default is False. + rank_method (str, optional): Method used for ranking elements during compression. Default is "llmlingua". + concate_question (bool, optional): Whether to concatenate the question to the compressed prompt. Default is True. + + Returns: + dict: A dictionary containing: + - "compressed_prompt" (str): The resulting compressed prompt. + - "origin_tokens" (int): The original number of tokens in the input. + - "compressed_tokens" (int): The number of tokens in the compressed output. + - "ratio" (str): The compression ratio achieved, calculated as the original token number divided by the token number after compression. + - "rate" (str): The compression rate achieved, in a human-readable format. + - "saving" (str): Estimated savings in GPT-4 token usage. 
+ """ + if not context: + context = [" "] + if isinstance(context, str): + context = [context] + + context_tokens_length = [self.get_token_length(c) for c in context] + instruction_tokens_length, question_tokens_length = self.get_token_length( + instruction + ), self.get_token_length(question) + if global_target_token == -1: + global_target_token = ( + ( + instruction_tokens_length + + question_tokens_length + + sum(context_tokens_length) + ) + * global_rate + - instruction_tokens_length + - (question_tokens_length if concate_question else 0) + ) + else: + global_rate = global_target_token / sum(context_tokens_length) + context, context_segs, context_segs_rate, context_segs_compress = ( + self.segment_structured_context(context, global_rate) + ) + return self.compress_prompt( + context, + instruction, + question, + global_rate, + global_target_token, + iterative_size, + force_context_ids, + force_context_number, + use_sentence_level_filter, + use_context_level_filter, + use_token_level_filter, + keep_split, + keep_first_sentence, + keep_last_sentence, + keep_sentence_number, + high_priority_bonus, + context_budget, + token_budget_ratio, + condition_in_question, + reorder_context, + dynamic_context_compression_ratio, + condition_compare, + add_instruction, + rank_method, + concate_question, + context_segs=context_segs, + context_segs_rate=context_segs_rate, + context_segs_compress=context_segs_compress, + ) + def compress_prompt( self, context: List[str], instruction: str = "", question: str = "", - ratio: float = 2.0, + rate: float = 0.5, target_token: float = -1, iterative_size: int = 200, force_context_ids: List[int] = None, @@ -181,8 +327,8 @@ def compress_prompt( add_instruction: bool = False, rank_method: str = "llmlingua", concate_question: bool = True, - context_segs: List[str] = None, - context_segs_ratio: List[float] = None, + context_segs: List[str] = None, + context_segs_rate: List[float] = None, context_segs_compress: List[bool] = None, ): """ @@ -192,16 +338,18 @@ def compress_prompt( context (List[str]): List of context strings that form the basis of the prompt. instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. question (str, optional): A specific question that the prompt is addressing. Default is an empty string. - ratio (float, optional): The minimum compression ratio target to be achieved. The compression ratio is defined - the same as in Wikipedia [Data compression ratio](https://en.wikipedia.org/wiki/Data_compression_ratio): - .. math::\text{Compression Ratio} = \frac{\text{Uncompressed Size}}{\text{Compressed Size}} - Default is 2.0. The actual compression ratio generally exceeds the specified target, but there can be - fluctuations due to differences in tokenizers. If specified, it should be a float greater than or equal - to 1.0, representing the target compression ratio. - target_token (float, optional): The maximum number of tokens to be achieved. Default is -1, indicating no specific target. - The actual number of tokens after compression should generally be less than the specified target_token, but there can - be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as - the sole criterion, overriding the ``ratio``. + rate (float, optional): The maximum compression rate target to be achieved. The compression rate is defined + the same as in paper "Language Modeling Is Compression". 
Delétang, Grégoire, Anian Ruoss, Paul-Ambroise Duquenne, + Elliot Catt, Tim Genewein, Christopher Mattern, Jordi Grau-Moya et al. "Language modeling is compression." + arXiv preprint arXiv:2309.10668 (2023): + .. math::\text{Compression Rate} = \frac{\text{Compressed Size}}{\text{Raw Size}} + Default is 0.5. The actual compression rate is generally lower than the specified target, but there can be + fluctuations due to differences in tokenizers. If specified, it should be a float less than or equal + to 1.0, representing the target compression rate. + target_token (float, optional): The maximum number of tokens to be achieved. Default is -1, indicating no specific target. + The actual number of tokens after compression should generally be less than the specified target_token, but there can + be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as + the sole criterion, overriding the ``rate``. iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. force_context_number (int, optional): The number of context sections to forcibly include. Default is None. @@ -228,10 +376,14 @@ def compress_prompt( - "compressed_prompt" (str): The resulting compressed prompt. - "origin_tokens" (int): The original number of tokens in the input. - "compressed_tokens" (int): The number of tokens in the compressed output. - - "ratio" (str): The compression ratio achieved, in a human-readable format. - - "rate" (str): The compression rate achieved, calculated as the token number after compression divided by the original token number. + - "ratio" (str): The compression ratio achieved, calculated as the original token number divided by the token number after compression. + - "rate" (str): The compression rate achieved, in a human-readable format. - "saving" (str): Estimated savings in GPT-4 token usage. """ + assert ( + rate <= 1.0 + ), "Error: 'rate' must not exceed 1.0. The value of 'rate' indicates compression rate and must be within the range [0, 1]." 
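The "ratio" and "rate" fields described above are reciprocals of one another; the short standalone sketch below shows the bookkeeping with assumed token counts (illustrative only, not output from a real compression run):

    # ratio = original / compressed (e.g. "2.0x"); rate = compressed / original (e.g. "50.0%").
    origin_tokens = 2366
    compressed_tokens = 1183

    ratio = 1 if compressed_tokens == 0 else origin_tokens / compressed_tokens
    rate = compressed_tokens / origin_tokens  # Compressed Size / Raw Size

    print(f"{ratio:.1f}x")       # -> 2.0x
    print(f"{rate * 100:.1f}%")  # -> 50.0%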
+ if not context: context = [" "] if isinstance(context, str): @@ -264,7 +416,7 @@ def compress_prompt( + question_tokens_length + sum(context_tokens_length) ) - * (1 / ratio) + * rate - instruction_tokens_length - (question_tokens_length if concate_question else 0) ) @@ -285,13 +437,15 @@ def compress_prompt( rank_method=rank_method, context_budget=context_budget, context_segs=context_segs, - context_segs_ratio=context_segs_ratio, + context_segs_rate=context_segs_rate, context_segs_compress=context_segs_compress, ) if context_segs is not None: context_segs = [context_segs[idx] for idx in context_used] - context_segs_ratio = [context_segs_ratio[idx] for idx in context_used] - context_segs_compress = [context_segs_compress[idx] for idx in context_used] + context_segs_rate = [context_segs_rate[idx] for idx in context_used] + context_segs_compress = [ + context_segs_compress[idx] for idx in context_used + ] else: dynamic_ratio = [0.0] * len(context) @@ -309,13 +463,24 @@ def compress_prompt( condition_in_question=condition_in_question, rank_method=rank_method, context_segs=context_segs, - context_segs_ratio=context_segs_ratio, + context_segs_rate=context_segs_rate, context_segs_compress=context_segs_compress, ) elif context_segs is not None: for context_idx in range(len(context)): - segments_info.append([(len(seg_text), seg_ratio, seg_compress) for seg_text, seg_ratio, seg_compress in zip(context_segs[context_idx], context_segs_ratio[context_idx], context_segs_compress[context_idx])]) - segments_info = [self.concate_segment_info(segment_info) for segment_info in segments_info] + segments_info.append( + [ + (len(seg_text), seg_rate, seg_compress) + for seg_text, seg_rate, seg_compress in zip( + context_segs[context_idx], + context_segs_rate[context_idx], + context_segs_compress[context_idx], + ) + ] + ) + segments_info = [ + self.concate_segment_info(segment_info) for segment_info in segments_info + ] if condition_flag: prefix = question + "\n\n" + instruction if add_instruction else question @@ -469,28 +634,39 @@ def get_structured_dynamic_compression_ratio( global_dynamic_rate, global_dynamic_compress, tmp_context = [], [], [] for context_idx, text in enumerate(context): text_seen = 0 - for seg_idx, (seg_len, seg_ratio, seg_compress) in enumerate(seg_info[context_idx]): + for seg_idx, (seg_len, seg_rate, seg_compress) in enumerate( + seg_info[context_idx] + ): seg_text = text[text_seen : text_seen + seg_len] - if seg_idx == len(seg_info[context_idx]) - 1 and context_idx != len(context) - 1: - seg_text += '\n\n' + if ( + seg_idx == len(seg_info[context_idx]) - 1 + and context_idx != len(context) - 1 + ): + seg_text += "\n\n" tmp_context.append(seg_text) if seg_compress: - global_dynamic_rate.append(1 / seg_ratio) + global_dynamic_rate.append(seg_rate) else: global_dynamic_rate.append(1.0) global_dynamic_compress.append(seg_compress) text_seen += seg_len - origin_text = '\n\n'.join(context) + origin_text = "\n\n".join(context) assert len("".join(tmp_context)) == len(origin_text) - dynamic_compression_ratio = self.token_segment(origin_text, iterative_size, tmp_context, global_dynamic_rate, global_dynamic_compress) + dynamic_compression_ratio = self.token_segment( + origin_text, + iterative_size, + tmp_context, + global_dynamic_rate, + global_dynamic_compress, + ) return dynamic_compression_ratio def token_segment( - self, - text: str, - iterative_size: int, - segments: List[str], - global_dynamic_rate: List[float], + self, + text: str, + iterative_size: int, + segments: List[str], + 
global_dynamic_rate: List[float], global_dynamic_compress: List[bool], ): assert len(segments) == len(global_dynamic_rate) == len(global_dynamic_compress) @@ -501,14 +677,21 @@ def token_segment( dynamic_compression_rate, local_compresssion_rate = [], [] for i in range(len(text_input_ids)): if i < decode_window: - id_pre, id_cur = text_input_ids[: i], text_input_ids[: i + 1] + id_pre, id_cur = text_input_ids[:i], text_input_ids[: i + 1] else: - id_pre, id_cur = text_input_ids[i - decode_window + 1: i], text_input_ids[i - decode_window + 1: i + 1] - cur_word = self.tokenizer.decode(id_cur)[len(self.tokenizer.decode(id_pre)):] + id_pre, id_cur = ( + text_input_ids[i - decode_window + 1 : i], + text_input_ids[i - decode_window + 1 : i + 1], + ) + cur_word = self.tokenizer.decode(id_cur)[ + len(self.tokenizer.decode(id_pre)) : + ] cur_word_len = len(cur_word) if cur_word_len and cur_word_len >= len(segments[seg_idx]) - seg_seen: possible_rate, possible_compress = [], [] - while cur_word_len and cur_word_len >= len(segments[seg_idx]) - seg_seen: + while ( + cur_word_len and cur_word_len >= len(segments[seg_idx]) - seg_seen + ): possible_rate.append(global_dynamic_rate[seg_idx]) possible_compress.append(global_dynamic_compress[seg_idx]) cur_word_len -= len(segments[seg_idx]) - seg_seen @@ -532,7 +715,9 @@ def token_segment( dynamic_compression_rate.append(local_compresssion_rate[:]) local_compresssion_rate = [] if token_seen_num != len(text_input_ids): - local_compresssion_rate.append((len(text_input_ids) - token_seen_num, last_rate)) + local_compresssion_rate.append( + (len(text_input_ids) - token_seen_num, last_rate) + ) if local_compresssion_rate != []: dynamic_compression_rate.append(local_compresssion_rate[:]) return dynamic_compression_rate @@ -551,7 +736,7 @@ def control_context_budget( rank_method: str = "longllmlingua", context_budget: str = "+100", context_segs: List[List[str]] = None, - context_segs_ratio: List[List[float]] = None, + context_segs_rate: List[List[float]] = None, context_segs_compress: List[List[bool]] = None, ): demostrations_sort = self.get_rank_results( @@ -619,7 +804,7 @@ def control_sentence_budget( condition_in_question: str = "none", rank_method: str = "longllmlingua", context_segs: List[List[str]] = None, - context_segs_ratio: List[List[float]] = None, + context_segs_rate: List[List[float]] = None, context_segs_compress: List[List[bool]] = None, ): def keep_sentence(dem_idx: int, sent_keep: int): @@ -629,7 +814,7 @@ def keep_sentence(dem_idx: int, sent_keep: int): def sync_sentence(segments, text): seg_num = len(segments) - new_segments= [] + new_segments = [] text_seen = 0 seg_idx, cur_seg_seen = 0, 0 for i, s in enumerate(text): @@ -653,7 +838,7 @@ def sync_sentence(segments, text): new_segments.append(text[text_seen:]) assert len("".join(new_segments)) == len(text) return new_segments - + sentences = [nltk.sent_tokenize(c) for c in context] dem_g, s2de, idx = defaultdict(set), defaultdict(int), 0 for idx_d, s in enumerate(sentences): @@ -663,7 +848,9 @@ def sync_sentence(segments, text): idx += 1 if context_segs is not None: - context_segs = [sync_sentence(s, "".join(c)) for s, c in zip(context_segs, sentences)] + context_segs = [ + sync_sentence(s, "".join(c)) for s, c in zip(context_segs, sentences) + ] sen2seg_ratio = {} idx = 0 for idx_d, sentences_each_context in enumerate(sentences): @@ -675,17 +862,29 @@ def sync_sentence(segments, text): while remain: if segments_length[seg_idx] - cur_seg_seen <= remain: new_seg_len = segments_length[seg_idx] - 
cur_seg_seen - sentence_seg_ratio.append((new_seg_len, context_segs_ratio[idx_d][seg_idx], context_segs_compress[idx_d][seg_idx])) + sentence_seg_ratio.append( + ( + new_seg_len, + context_segs_rate[idx_d][seg_idx], + context_segs_compress[idx_d][seg_idx], + ) + ) seg_idx += 1 cur_seg_seen = 0 remain -= new_seg_len else: - sentence_seg_ratio.append((remain, context_segs_ratio[idx_d][seg_idx], context_segs_compress[idx_d][seg_idx])) + sentence_seg_ratio.append( + ( + remain, + context_segs_rate[idx_d][seg_idx], + context_segs_compress[idx_d][seg_idx], + ) + ) cur_seg_seen += remain remain = 0 sen2seg_ratio[idx] = sentence_seg_ratio idx += 1 - + context_sentences = [s for ii in sentences for s in ii] sentence_tokens_length = [ self.get_token_length(sentence) for sentence in context_sentences @@ -739,7 +938,7 @@ def sync_sentence(segments, text): sentence_flags[idx] = True if target_token < 0: break - + if context_segs is not None: for idx in range(N): preserved = [sen_seg_info[2] for sen_seg_info in sen2seg_ratio[idx]] @@ -756,13 +955,28 @@ def sync_sentence(segments, text): segment_ratio = [] for ii in range(len(s)): if sentence_flags[idx + ii]: - last_element = (sen2seg_ratio[idx + ii][-1][0] + 1, sen2seg_ratio[idx + ii][-1][1], sen2seg_ratio[idx + ii][-1][2]) - segment_ratio.extend(sen2seg_ratio[idx + ii][:-1] + [last_element]) - segment_ratio = segment_ratio[:-1] + [(segment_ratio[-1][0] - 1, segment_ratio[-1][1], segment_ratio[-1][2])] - new_segments_info.append(segment_ratio) + last_element = ( + sen2seg_ratio[idx + ii][-1][0] + 1, + sen2seg_ratio[idx + ii][-1][1], + sen2seg_ratio[idx + ii][-1][2], + ) + segment_ratio.extend( + sen2seg_ratio[idx + ii][:-1] + [last_element] + ) + segment_ratio = segment_ratio[:-1] + [ + ( + segment_ratio[-1][0] - 1, + segment_ratio[-1][1], + segment_ratio[-1][2], + ) + ] + new_segments_info.append(segment_ratio) idx += len(s) if context_segs is not None: - new_segments_info = [self.concate_segment_info(segment_info) for segment_info in new_segments_info] + new_segments_info = [ + self.concate_segment_info(segment_info) + for segment_info in new_segments_info + ] return res, new_segments_info def get_compressed_input( @@ -875,7 +1089,7 @@ def get_estimate_threshold_base_distribution( self, ppl, ratio: float, condition_flag: bool = False ): if ratio == 1.0: - return float('-inf') + return float("-inf") ppl = ppl[ppl != 10000] target_token = max(0, min(len(ppl) - 1, int(len(ppl) * ratio) - 1)) return ( @@ -897,7 +1111,7 @@ def iterative_compress_prompt( dynamic_ratio: list = None, condition_compare: bool = False, segments_info: List[List[tuple]] = None, - ): + ): if segments_info is None or segments_info == []: iterative_ratios = self.get_dynamic_compression_ratio( context, target_token, iterative_size, dynamic_ratio, start @@ -907,7 +1121,9 @@ def iterative_compress_prompt( context, iterative_size, dynamic_ratio, start, segments_info ) context = "\n\n".join(context) - tokenized_text = self.tokenizer(context, return_tensors="pt", add_special_tokens=False) + tokenized_text = self.tokenizer( + context, return_tensors="pt", add_special_tokens=False + ) input_ids = tokenized_text["input_ids"].to(self.device) attention_mask = tokenized_text["attention_mask"].to(self.device) @@ -951,7 +1167,9 @@ def iterative_compress_prompt( while end <= compressed_input_ids.shape[1]: if end > self.max_position_embeddings and past_key_values is not None: # KV-Cache Compression - e, s = end - self.max_position_embeddings, min(self.cache_bos_num + start, 
self.max_position_embeddings) + e, s = end - self.max_position_embeddings, min( + self.cache_bos_num + start, self.max_position_embeddings + ) if pop_compressed_input_ids is None: pop_compressed_input_ids = compressed_input_ids[:, :e] else: @@ -1071,7 +1289,7 @@ def iterative_compress_prompt( else: threshold = self.get_estimate_threshold_base_distribution( loss, ratio, False - ) + ) ( compressed_input_ids, @@ -1093,12 +1311,12 @@ def iterative_compress_prompt( split_token_id=split_token_id, start=start, self_loss=self_loss if condition_compare else None, - self_input_ids=self_compressed_input_ids - if condition_compare - else None, - self_attention_mask=self_compressed_attention_mask - if condition_compare - else None, + self_input_ids=( + self_compressed_input_ids if condition_compare else None + ), + self_attention_mask=( + self_compressed_attention_mask if condition_compare else None + ), ) end += iterative_size idx += 1 @@ -1430,190 +1648,81 @@ def get_distance_longllmlingua(corpus, query): method = get_distance_cohere return method(context, question) - def structured_compress_prompt( + def segment_structured_context( self, context: List[str], - instruction: str = "", - question: str = "", - global_ratio: float = 2.0, - global_target_token: float = -1, - iterative_size: int = 200, - force_context_ids: List[int] = None, - force_context_number: int = None, - use_sentence_level_filter: bool = False, - use_context_level_filter: bool = True, - use_token_level_filter: bool = True, - keep_split: bool = False, - keep_first_sentence: int = 0, - keep_last_sentence: int = 0, - keep_sentence_number: int = 0, - high_priority_bonus: int = 100, - context_budget: str = "+100", - token_budget_ratio: float = 1.4, - condition_in_question: str = "none", - reorder_context: str = "original", - dynamic_context_compression_ratio: float = 0.0, - condition_compare: bool = False, - add_instruction: bool = False, - rank_method: str = "llmlingua", - concate_question: bool = True, + global_rate: float, ): - """ - Compresses the given prompt context based on a specified structure. - - Each element of context should be segmented using one or more non-nested '<llmlingua></llmlingua>' tags. Each '<llmlingua></llmlingua>' tag - can include optional parameters 'ratio' and 'compress' (e.g., '<llmlingua, ratio=2.0, compress=True>'), - indicating the compression ratio for that segment. Default values are 'ratio=global_ratio' and 'compress=True'. - When 'compress' is set to False, it overrides the 'ratio' parameter, resulting in no compression for that segment. - - Args: - context (List[str]): List of context strings divided by '<llmlingua></llmlingua>' tags with optional compression settings. - instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string. - question (str, optional): A specific question that the prompt is addressing. Default is an empty string. - global_ratio (float, optional): The compression ratio is defined the same as in Wikipedia [Data compression ratio] - (https://en.wikipedia.org/wiki/Data_compression_ratio): - .. math::\text{Compression Ratio} = \frac{\text{Uncompressed Size}}{\text{Compressed Size}} - Default is 2.0. The actual compression ratio generally exceeds the specified target, but there can be - fluctuations due to differences in tokenizers. If specified, it should be a float greater than or equal - to 1.0, representing the target compression ratio. ``global_ratio`` is applicable only within the context-level filter - and the sentence-level filter. In the token-level filter, the ratio for each segment overrides the global ratio.
- However, for segments where no specific ratio is defined, the global ratio serves as the default value. The final - compression ratio of the entire text is a composite result of multiple compression ratios applied across different sections. - global_target_token (float, optional): The global maximum number of tokens to be achieved. Default is -1, indicating no - specific target. The actual number of tokens after compression should generally be less than the specified target_token, - but there can be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as - the sole criterion, overriding the ``global_ratio``. ``global_target_token`` is applicable only within the context-level - filter and the sentence-level filter. In the token-level filter, the ratio for each segment overrides the global target token. - However, for segments where no specific ratio is defined, the global ratio calculated from the global target token serves - as the default value. The final target token of the entire text is a composite result of multiple compression ratios - applied across different sections. - iterative_size (int, optional): The number of tokens to consider in each iteration of compression. Default is 200. - force_context_ids (List[int], optional): List of specific context IDs to always include in the compressed result. Default is None. - force_context_number (int, optional): The number of context sections to forcibly include. Default is None. - use_sentence_level_filter (bool, optional): Whether to apply sentence-level filtering in compression. Default is False. - use_context_level_filter (bool, optional): Whether to apply context-level filtering in compression. Default is True. - use_token_level_filter (bool, optional): Whether to apply token-level filtering in compression. Default is True. - keep_split (bool, optional): Whether to preserve the original separators without compression. Default is False. - keep_first_sentence (int, optional): Number of sentences to forcibly preserve from the start of the context. Default is 0. - keep_last_sentence (int, optional): Number of sentences to forcibly preserve from the end of the context. Default is 0. - keep_sentence_number (int, optional): Total number of sentences to forcibly preserve in the compression. Default is 0. - high_priority_bonus (int, optional): Bonus score for high-priority sentences to influence their likelihood of being retained. Default is 100. - context_budget (str, optional): Token budget for the context-level filtering, expressed as a string to indicate flexibility. Default is "+100". - token_budget_ratio (float, optional): Ratio to adjust token budget during sentence-level filtering. Default is 1.4. - condition_in_question (str, optional): Specific condition to apply to question in the context. Default is "none". - reorder_context (str, optional): Strategy for reordering context in the compressed result. Default is "original". - dynamic_context_compression_ratio (float, optional): Ratio for dynamically adjusting context compression. Default is 0.0. - condition_compare (bool, optional): Whether to enable condition comparison during token-level compression. Default is False. - add_instruction (bool, optional): Whether to add the instruction to the prompt prefix. Default is False. - rank_method (str, optional): Method used for ranking elements during compression. Default is "llmlingua". - concate_question (bool, optional): Whether to concatenate the question to the compressed prompt.
Default is True. - - Returns: - dict: A dictionary containing: - - "compressed_prompt" (str): The resulting compressed prompt. - - "origin_tokens" (int): The original number of tokens in the input. - - "compressed_tokens" (int): The number of tokens in the compressed output. - - "ratio" (str): The compression ratio achieved, in a human-readable format. - - "rate" (str): The compression rate achieved, calculated as the token number after compression divided by the original token number. - - "saving" (str): Estimated savings in GPT-4 token usage. - """ - if not context: - context = [" "] - if isinstance(context, str): - context = [context] - - context_tokens_length = [self.get_token_length(c) for c in context] - instruction_tokens_length, question_tokens_length = self.get_token_length( - instruction - ), self.get_token_length(question) - if global_target_token == -1: - global_target_token = ( - ( - instruction_tokens_length - + question_tokens_length - + sum(context_tokens_length) - ) - * (1 / global_ratio) - - instruction_tokens_length - - (question_tokens_length if concate_question else 0) - ) - else: - global_ratio = global_target_token / sum(context_tokens_length) - - context, context_segs, context_segs_ratio, context_segs_compress = self.segment_structured_context(context, global_ratio) - return self.compress_prompt( - context, - instruction, - question, - global_ratio, - global_target_token, - iterative_size, - force_context_ids, - force_context_number, - use_sentence_level_filter, - use_context_level_filter, - use_token_level_filter, - keep_split, - keep_first_sentence, - keep_last_sentence, - keep_sentence_number, - high_priority_bonus, - context_budget, - token_budget_ratio, - condition_in_question, - reorder_context, - dynamic_context_compression_ratio, - condition_compare, - add_instruction, - rank_method, - concate_question, - context_segs=context_segs, - context_segs_ratio=context_segs_ratio, - context_segs_compress=context_segs_compress, + new_context, context_segs, context_segs_rate, context_segs_compress = ( + [], + [], + [], + [], ) - - def segment_structured_context( - self, - context: List[str], - global_ratio: float, - ): - new_context, context_segs, context_segs_ratio, context_segs_compress = [], [], [], [] for text in context: if not text.startswith("<llmlingua"): text = "<llmlingua>" + text if not text.endswith("</llmlingua>"): text = text + "</llmlingua>" - - # Regular expression to match content, allowing ratio and compress in any order - pattern = r"<llmlingua\s*,?\s*(?:ratio=([\d\.]+))?\s*,?\s*(?:compress=(True|False))?\s*,?\s*(?:ratio=([\d\.]+))?\s*,?\s*(?:compress=(True|False))?\s*>([^<]+)</llmlingua>" + + # Regular expression to match content, allowing rate and compress in any order + pattern = r"<llmlingua\s*,?\s*(?:rate=([\d\.]+))?\s*,?\s*(?:compress=(True|False))?\s*,?\s*(?:rate=([\d\.]+))?\s*,?\s*(?:compress=(True|False))?\s*>([^<]+)</llmlingua>" matches = re.findall(pattern, text) # Extracting segment contents segments = [match[4] for match in matches]
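As a quick check of the tag grammar parsed here, the snippet below runs the pattern over a two-segment context; it is a standalone sketch, and the exact regex spelling is an assumption inferred from the match[0]..match[4] indexing used above:

    import re

    # Five capture groups: rate/compress may appear in either order,
    # followed by the segment text itself.
    pattern = (
        r"<llmlingua\s*,?\s*(?:rate=([\d\.]+))?\s*,?\s*(?:compress=(True|False))?"
        r"\s*,?\s*(?:rate=([\d\.]+))?\s*,?\s*(?:compress=(True|False))?\s*>([^<]+)</llmlingua>"
    )
    text = (
        "<llmlingua, compress=False>Keep this header verbatim.</llmlingua>"
        "<llmlingua, rate=0.4>Aggressively compress this body.</llmlingua>"
    )
    for match in re.findall(pattern, text):
        rate = float(match[0]) if match[0] else (float(match[2]) if match[2] else None)
        compress = match[1] == "True" if match[1] else (match[3] == "True" if match[3] else None)
        print(match[4], rate, compress)
    # Keep this header verbatim. None False   -> compress=False, so rate falls back to 1.0
    # Aggressively compress this body. 0.4 None -> compress defaults to True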
+ # Extracting rate and compress, considering their possible positions + segs_rate = [ + float(match[0]) if match[0] else (float(match[2]) if match[2] else None) + for match in matches + ] + segs_compress = [ + ( + match[1] == "True" + if match[1] + else (match[3] == "True" if match[3] else None) + ) + for match in matches + ] + + segs_compress = [ + compress if compress is not None else True for compress in segs_compress + ] + segs_rate = [ + rate if rate else (global_rate if compress else 1.0) + for rate, compress in zip(segs_rate, segs_compress) + ] + assert ( + len(segments) == len(segs_rate) == len(segs_compress) + ), "The number of segments, rates, and compress flags should be the same." + assert all( + seg_rate <= 1.0 for seg_rate in segs_rate + ), "Error: 'rate' must not exceed 1.0. The value of 'rate' indicates compression rate and must be within the range [0, 1]." new_context.append("".join(segments)) context_segs.append(segments) - context_segs_ratio.append(segs_ratio) + context_segs_rate.append(segs_rate) context_segs_compress.append(segs_compress) - return new_context, context_segs, context_segs_ratio, context_segs_compress + return new_context, context_segs, context_segs_rate, context_segs_compress def concate_segment_info( - self, + self, segment_info: List[List[tuple]], - ): + ): new_segment_info = [] for i, (seg_len, seg_ratio, seg_compress) in enumerate(segment_info): - if new_segment_info and new_segment_info[-1][1] == seg_ratio and new_segment_info[-1][2] == seg_compress: - new_segment_info[-1] = (new_segment_info[-1][0] + seg_len, seg_ratio, seg_compress) + if ( + new_segment_info + and new_segment_info[-1][1] == seg_ratio + and new_segment_info[-1][2] == seg_compress + ): + new_segment_info[-1] = ( + new_segment_info[-1][0] + seg_len, + seg_ratio, + seg_compress, + ) else: new_segment_info.append((seg_len, seg_ratio, seg_compress)) - return new_segment_info \ No newline at end of file + return new_segment_info From 39058149f8942a175cf80dccbc6f0c6fad696975 Mon Sep 17 00:00:00 2001 From: siyunzhao Date: Tue, 20 Feb 2024 08:42:55 +0000 Subject: [PATCH 11/19] change parameter name --- llmlingua/prompt_compressor.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 724f039..045f57f 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -161,8 +161,8 @@ def structured_compress_prompt( context: List[str], instruction: str = "", question: str = "", - global_rate: float = 0.5, - global_target_token: float = -1, + rate: float = 0.5, + target_token: float = -1, iterative_size: int = 200, force_context_ids: List[int] = None, force_context_number: int = None, @@ -189,27 +189,27 @@ def structured_compress_prompt( Each element of context should be segmented using one or more non-nested '<llmlingua></llmlingua>' tags. Each '<llmlingua></llmlingua>' tag can include optional parameters 'rate' and 'compress' (e.g., '<llmlingua, rate=0.5, compress=True>'), - indicating the compression rate for that segment. Default values are 'rate=global_rate' and 'compress=True'. + indicating the compression rate for that segment. Default values are 'rate=rate' and 'compress=True'. When 'compress' is set to False, it overrides the 'rate' parameter, resulting in no compression for that segment. Args: context (List[str]): List of context strings divided by '<llmlingua></llmlingua>' tags with optional compression settings. instruction (str, optional): Additional instruction text to be included in the prompt. Default is an empty string.
question (str, optional): A specific question that the prompt is addressing. Default is an empty string. - global_rate (float, optional): The compression rate is defined the same as in paper "Language Modeling Is Compression". + rate (float, optional): The compression rate is defined the same as in paper "Language Modeling Is Compression". Delétang, Grégoire, Anian Ruoss, Paul-Ambroise Duquenne, Elliot Catt, Tim Genewein, Christopher Mattern, Jordi Grau-Moya et al. "Language modeling is compression." arXiv preprint arXiv:2309.10668 (2023): .. math::\text{Compression Rate} = \frac{\text{Compressed Size}}{\text{Raw Size}} Default is 0.5. The actual compression rate is generally lower than the specified target, but there can be fluctuations due to differences in tokenizers. If specified, it should be a float less than or equal - to 1.0, representing the target compression rate. ``global_rate`` is applicable only within the context-level filter + to 1.0, representing the target compression rate. ``rate`` is applicable only within the context-level filter and the sentence-level filter. In the token-level filter, the rate for each segment overrides the global rate. However, for segments where no specific rate is defined, the global rate serves as the default value. The final compression rate of the entire text is a composite result of multiple compression rates applied across different sections. - global_target_token (float, optional): The global maximum number of tokens to be achieved. Default is -1, indicating no + target_token (float, optional): The global maximum number of tokens to be achieved. Default is -1, indicating no specific target. The actual number of tokens after compression should generally be less than the specified target_token, but there can be fluctuations due to differences in tokenizers. If specified, compression will be based on the target_token as - the sole criterion, overriding the ``global_rate``. ``global_target_token`` is applicable only within the context-level + the sole criterion, overriding the ``rate``. ``target_token`` is applicable only within the context-level filter and the sentence-level filter. In the token-level filter, the rate for each segment overrides the global target token. However, for segments where no specific rate is defined, the global rate calculated from the global target token serves as the default value.
The final target token of the entire text is a composite result of multiple compression rates @@ -253,28 +253,28 @@ def structured_compress_prompt( instruction_tokens_length, question_tokens_length = self.get_token_length( instruction ), self.get_token_length(question) - if global_target_token == -1: - global_target_token = ( + if target_token == -1: + target_token = ( ( instruction_tokens_length + question_tokens_length + sum(context_tokens_length) ) - * global_rate + * rate - instruction_tokens_length - (question_tokens_length if concate_question else 0) ) else: - global_rate = global_target_token / sum(context_tokens_length) + rate = target_token / sum(context_tokens_length) context, context_segs, context_segs_rate, context_segs_compress = ( - self.segment_structured_context(context, global_rate) + self.segment_structured_context(context, rate) ) return self.compress_prompt( context, instruction, question, - global_rate, - global_target_token, + rate, + target_token, iterative_size, force_context_ids, force_context_number, From 2cd5c7e82babd770a83f48649ce218124e4074d1 Mon Sep 17 00:00:00 2001 From: Huiqiang Jiang Date: Tue, 20 Feb 2024 14:16:24 +0000 Subject: [PATCH 12/19] Feature(LLMLingua): add temperature & test scripts --- .github/ISSUE_TEMPLATE/bug_report.yml | 47 ++++++++++++++++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/ISSUE_TEMPLATE/feature_request.yml | 26 +++++++++ .github/ISSUE_TEMPLATE/general_issue.yml | 12 +++++ .github/PULL_REQUEST_TEMPLATE.md | 43 +++++++++++++++ .github/workflows/release.yml | 40 ++++++++++++++ .github/workflows/unittest.yml | 42 +++++++++++++++ .pre-commit-config.yaml | 51 ++++++++++++++++++ Makefile | 13 ++--- llmlingua/version.py | 2 +- setup.py | 3 ++ tests/test_llmlingua.py | 45 ++++++++++++++++ tests/test_longllmlingua.py | 62 ++++++++++++++++++++++ 13 files changed, 380 insertions(+), 7 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/ISSUE_TEMPLATE/general_issue.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/unittest.yml create mode 100644 .pre-commit-config.yaml create mode 100644 tests/test_llmlingua.py create mode 100644 tests/test_longllmlingua.py diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..80f8c6e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,47 @@ +name: "\U0001F41B Bug Report" +description: Submit a bug report to help us improve LLMLingua +title: "[Bug]: " +labels: ["bug"] + +body: + - type: textarea + id: description + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. + placeholder: What went wrong? + - type: textarea + id: reproduce + attributes: + label: Steps to reproduce + description: | + Steps to reproduce the behavior: + + 1. Step 1 + 2. Step 2 + 3. ... + 4. See error + placeholder: How can we replicate the issue? + - type: textarea + id: expected_behavior + attributes: + label: Expected Behavior + description: A clear and concise description of what you expected to happen. + placeholder: What should have happened? + - type: textarea + id: logs + attributes: + label: Logs + description: If applicable, add logs or screenshots to help explain your problem.
+ placeholder: Add logs here + - type: textarea + id: additional_information + attributes: + label: Additional Information + description: | + - LLMLingua Version: + - Operating System: + - Python Version: + - Related Issues: + - Any other relevant information. + placeholder: Any additional details \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..a49eab2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..03994b3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,26 @@ +name: "\U0001F680 Feature request" +description: Submit a proposal/request for a new LLMLingua feature +labels: ["feature request"] +title: "[Feature Request]: " + +body: + - type: textarea + id: problem_description + attributes: + label: Is your feature request related to a problem? Please describe. + description: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + placeholder: What problem are you trying to solve? + + - type: textarea + id: solution_description + attributes: + label: Describe the solution you'd like + description: A clear and concise description of what you want to happen. + placeholder: How do you envision the solution? + + - type: textarea + id: additional_context + attributes: + label: Additional context + description: Add any other context or screenshots about the feature request here. + placeholder: Any additional information \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/general_issue.yml b/.github/ISSUE_TEMPLATE/general_issue.yml new file mode 100644 index 0000000..1aa6d79 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/general_issue.yml @@ -0,0 +1,12 @@ +name: "\U0001F31F General Question" +description: File a general question +title: "[Question]: " +labels: ["question"] + +body: + - type: textarea + id: description + attributes: + label: Describe the issue + description: A clear and concise description of what the question is. + placeholder: The detail of question. \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..2e75eb0 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,43 @@ +# What does this PR do? + + + + + +Fixes # (issue) + + +## Before submitting +- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). +- [ ] Was this discussed/approved via a Github issue? Please add a link + to it if that's the case. +- [ ] Did you make sure to update the documentation with your changes? +- [ ] Did you write any new necessary tests? + + +## Who can review? + +Anyone in the community is free to review the PR once the tests have passed. Feel free to tag +members/contributors who may be interested in your PR. 
+ + diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..8944a8a --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,40 @@ + # This workflow will build and upload a Python Package using Twine when a release is published +# Conda-forge bot will pick up the new PyPI version and automatically create a new version +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: release run-name: Release LLMLingua by @${{ github.actor }} + +on: + release: + types: [published] permissions: {} + +jobs: + deploy: + strategy: + matrix: + os: ['ubuntu-latest'] + python-version: ["3.10"] + runs-on: ${{ matrix.os }} + environment: package + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Install from source + # This is required for the pre-commit tests + shell: pwsh + run: pip install . + + - name: Build + shell: pwsh + run: | + pip install twine + python setup.py sdist bdist_wheel + - name: Publish to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + shell: pwsh + run: twine upload dist/* \ No newline at end of file diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml new file mode 100644 index 0000000..a2bfcf4 --- /dev/null +++ b/.github/workflows/unittest.yml @@ -0,0 +1,42 @@ +name: Unit Test + +# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on pull request or merge + pull_request: + merge_group: + types: [checks_requested] + +defaults: + run: + shell: bash permissions: {} + +jobs: + UniTest: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-2019] + python-version: ["3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + name: Setup python ${{ matrix.python-version }} + id: setup-python + with: + python-version: ${{ matrix.python-version }} + + - name: Install packages and dependencies for all tests + run: | + python -m pip install --upgrade pip wheel + pip install pytest pytest-xdist + + - name: Install packages + run: | + pip install -e .
+ + - name: Run core tests + shell: bash + run: | + make test diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..012b522 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,51 @@ +default_language_version: + python: python3 +exclude: 'dotnet' +ci: + autofix_prs: true + autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' + autoupdate_schedule: 'quarterly' + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-added-large-files + - id: check-ast + - id: check-yaml + - id: check-toml + - id: check-json + - id: check-byte-order-marker + exclude: .gitignore + - id: check-merge-conflict + - id: detect-private-key + - id: trailing-whitespace + - id: end-of-file-fixer + - id: no-commit-to-branch + - repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.0.261 + hooks: + - id: ruff + args: ["--fix"] + - repo: https://github.com/codespell-project/codespell + rev: v2.2.6 + hooks: + - id: codespell + args: ["-L", "ans,linar,nam,"] + exclude: | + (?x)^( + pyproject.toml | + website/static/img/ag.svg | + website/yarn.lock | + notebook/.* + )$ + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.7.1 + hooks: + - id: nbqa-ruff + args: ["--fix"] + - id: nbqa-black \ No newline at end of file diff --git a/Makefile b/Makefile index b7fa0bf..8bd4d6b 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,7 @@ -.PHONY: install style_check_on_modified style +.PHONY: install style test -export PYTHONPATH = src - -PYTHON := python3 -CHECK_DIRS := llmlingua +PYTHON := python +CHECK_DIRS := llmlingua tests install: @${PYTHON} setup.py bdist_wheel @@ -12,4 +10,7 @@ install: style: black $(CHECK_DIRS) isort -rc $(CHECK_DIRS) - flake8 $(CHECK_DIRS) \ No newline at end of file + flake8 $(CHECK_DIRS) + +test: + @${PYTHON} -m pytest -n auto --dist=loadfile -s -v ./tests/ \ No newline at end of file diff --git a/llmlingua/version.py b/llmlingua/version.py index d0fa52b..01971da 100644 --- a/llmlingua/version.py +++ b/llmlingua/version.py @@ -1,6 +1,6 @@ # Copyright (c) 2023 Microsoft # Licensed under The MIT License [see LICENSE for details] - + _MAJOR = "0" _MINOR = "1" # On master and in a nightly release the patch should be one ahead of the last diff --git a/setup.py b/setup.py index 9be6c0e..eac19b6 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,9 @@ "black==21.4b0", "flake8>=3.8.3", "isort>=5.5.4", + "pre-commit", + "pytest", + "pytest-xdist", ] DEV_REQUIRES = INSTALL_REQUIRES + QUANLITY_REQUIRES diff --git a/tests/test_llmlingua.py b/tests/test_llmlingua.py new file mode 100644 index 0000000..687540e --- /dev/null +++ b/tests/test_llmlingua.py @@ -0,0 +1,45 @@ +import unittest +import unittest.mock as mock + +from llmlingua import PromptCompressor + + +class LLMLinguaTester(unittest.TestCase): + """ + End2end Test for LLMLingua + """ + + GSM8K_PROMPT = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. 
If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. How much does 1 apple cost if your total bill was $66?\nLet's think step by step\nIf 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\nIf 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\nIf my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\nAssuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\nIf we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1" + GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo and Melanie to plan how many hours they should together their test have 2 their textbook and 4 to They out should and 1 hours. they study, many they study total week they a break every hour, include 3minute and lunch day\n's think step\n Melanie should the chapters hours 2 = hours\n the to dedicate x\n Melanie to with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4" + GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "Question: You can buy 4 apples or 1 for. 
You bought 36 fruits evenly split between, waterons and of 1 orange $.. much does cost if your total bill $\n's think step\nIf were between 3 of, then I 36/3 = 12 of fruitIf 1 orange50 then oranges50 * $If66 I $ oranges I $66 $60 on the other 2 fruit\nAssuming the of is W, and that you price and of is then 1W=4AIf we know we bought 12 and, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1" + + def __init__(self, *args, **kwargs): + super(LLMLinguaTester, self).__init__(*args, **kwargs) + self.llmlingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu") + + def test_general_compress_prompt(self): + # Single Context + compressed_prompt = self.llmlingua.compress_prompt( + self.GSM8K_PROMPT.split("\n\n")[0], target_token=150 + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 422) + self.assertEqual(compressed_prompt["compressed_tokens"], 293) + self.assertEqual(compressed_prompt["ratio"], "1.4x") + self.assertEqual(compressed_prompt["rate"], "69.4%") + + # Multiple Context + compressed_prompt = self.llmlingua.compress_prompt( + self.GSM8K_PROMPT.split("\n\n"), target_token=150 + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 727) + self.assertEqual(compressed_prompt["compressed_tokens"], 206) + self.assertEqual(compressed_prompt["ratio"], "3.5x") + self.assertEqual(compressed_prompt["rate"], "28.3%") diff --git a/tests/test_longllmlingua.py b/tests/test_longllmlingua.py new file mode 100644 index 0000000..83808ea --- /dev/null +++ b/tests/test_longllmlingua.py @@ -0,0 +1,62 @@ +import unittest +import unittest.mock as mock + +from llmlingua import PromptCompressor + + +class LongLLMLinguaTester(unittest.TestCase): + """ + End2end Test for LongLLMLingua + """ + + GSM8K_PROMPT = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. 
Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. How much does 1 apple cost if your total bill was $66?\nLet's think step by step\nIf 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\nIf 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\nIf my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\nAssuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\nIf we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1\n\nQuestion: Susy goes to a large school with 800 students, while Sarah goes to a smaller school with only 300 students. At the start of the school year, Susy had 100 social media followers. She gained 40 new followers in the first week of the school year, half that in the second week, and half of that in the third week. Sarah only had 50 social media followers at the start of the year, but she gained 90 new followers the first week, a third of that in the second week, and a third of that in the third week. After three weeks, how many social media followers did the girl with the most total followers have?\nLet's think step by step\nAfter one week, Susy has 100+40 = 140 followers.\nIn the second week, Susy gains 40/2 = 20 new followers.\nIn the third week, Susy gains 20/2 = 10 new followers.\nIn total, Susy finishes the three weeks with 140+20+10 = 170 total followers.\nAfter one week, Sarah has 50+90 = 140 followers.\nAfter the second week, Sarah gains 90/3 = 30 followers.\nAfter the third week, Sarah gains 30/3 = 10 followers.\nSo, Sarah finishes the three weeks with 140+30+10 = 180 total followers.\nThus, Sarah is the girl with the most total followers with a total of 180.\nThe answer is 180" + GSM8K_250TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo Melanie want plan many over the together for their next week chapters of to study worksheets memorize They out they should hours each chapter 1. hours each worksheet plan study no each day, many plan to total take a 10minute break every hour 10- snack breaks each day, 30 minutes each day?Let think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of 2 chapters, x chapters hours total.For worksheets plan to 1.5 for works,.5 hours 4 worksheets hours total and Melanie need with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. 
Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?" + GSM8K_250TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "Question: You or 1 watermelon the price You bought 36 evenly split between oranges, watermelons, and price 1 orange is $0.50 How apple if total bill was $?Let's think step by stepIf were evenly split 3 types of fruits, 36/ = 12 units of fruit\nIf 1 orange $0. then will cost $0.50 * 126\nIf my total bill was $ and I spent $6 on oranges then I spent $66 - $6 = $ the other 2 fruit types.\nAssuming price is W knowing you 4 apples the same price that price one is A, 1=4A\n we we bought watermelons and apples for $60 we know that $ 12W + 12A\nKnowing that 1W=4A, we can convert the above to $ 12(4A) + 12A$60 = 48A + 12A\n$60 = 60\nThen know price of one apple (A) is $60/60= $\nThe answer is 1\n\nQuestiony while. After three weeks, how many social media followers did the girl with the most total followers have?\nLet's think step by step\nAfter one week, Susy has 100+40 = 140 followers.\nIn the second week, Susy gains 40/2 = 20 new followers.\nIn the third week, Susy gains 20/2 = 10 new followers.\nIn total, Susy finishes the three weeks with 140+20+10 = 170 total followers.\nAfter one week, Sarah has 50+90 = 140 followers.\nAfter the second week, Sarah gains 90/3 = 30 followers.\nAfter the third week, Sarah gains 30/3 = 10 followers.\nSo, Sarah finishes the three weeks with 140+30+10 = 180 total followers.\nThus, Sarah is the girl with the most total followers with a total of 180.\nThe answer is 180\n\nQuestion: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?" + QUESTION = "Question: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?" 
+ + def __init__(self, *args, **kwargs): + super(LongLLMLinguaTester, self).__init__(*args, **kwargs) + self.llmlingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu") + + def test_general_compress_prompt(self): + # Single Context + compressed_prompt = self.llmlingua.compress_prompt( + self.GSM8K_PROMPT.split("\n\n")[0], + question=self.QUESTION, + target_token=250, + condition_in_question="after_condition", + reorder_context="sort", + dynamic_context_compression_ratio=0.4, + condition_compare=True, + context_budget="+100", + rank_method="longllmlingua", + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.GSM8K_250TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 474) + self.assertEqual(compressed_prompt["compressed_tokens"], 385) + self.assertEqual(compressed_prompt["ratio"], "1.2x") + self.assertEqual(compressed_prompt["rate"], "81.2%") + + # Multiple Context + compressed_prompt = self.llmlingua.compress_prompt( + self.GSM8K_PROMPT.split("\n\n"), + question=self.QUESTION, + target_token=250, + condition_in_question="after_condition", + reorder_context="sort", + dynamic_context_compression_ratio=0.4, + condition_compare=True, + context_budget="+100", + rank_method="longllmlingua", + ) + self.assertEqual( + compressed_prompt["compressed_prompt"], + self.GSM8K_250TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT, + ) + self.assertEqual(compressed_prompt["origin_tokens"], 1094) + self.assertEqual(compressed_prompt["compressed_tokens"], 474) + self.assertEqual(compressed_prompt["ratio"], "2.3x") + self.assertEqual(compressed_prompt["rate"], "43.3%") From 7a570527ad8cf7e44b4127abae98b933dcaa508a Mon Sep 17 00:00:00 2001 From: Huiqiang Jiang Date: Tue, 20 Feb 2024 14:35:55 +0000 Subject: [PATCH 13/19] Feature(LLMLingua): add pre commit check & unittest --- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/ISSUE_TEMPLATE/feature_request.yml | 2 +- .github/ISSUE_TEMPLATE/general_issue.yml | 2 +- .github/workflows/release.yml | 2 +- .gitignore | 2 +- .pre-commit-config.yaml | 40 ++++----- DOCUMENT.md | 2 +- Makefile | 2 +- README.md | 26 +++--- SECURITY.md | 2 +- SUPPORT.md | 8 +- Transparency_FAQ.md | 42 ++++----- examples/CoT.ipynb | 42 ++++++--- examples/Code.ipynb | 27 ++++-- examples/OnlineMeeting.ipynb | 37 +++++--- examples/RAG.ipynb | 38 ++++++--- examples/RAGLlamaIndex.ipynb | 20 ++--- examples/Retrieval.ipynb | 99 ++++++++++++++-------- llmlingua/prompt_compressor.py | 9 +- setup.cfg | 2 +- setup.py | 2 +- tests/test_llmlingua.py | 1 - tests/test_longllmlingua.py | 1 - 24 files changed, 248 insertions(+), 164 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 80f8c6e..d520d52 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -44,4 +44,4 @@ body: - Python Version: - Related Issues: - Any other relevant information. 
- placeholder: Any additional details \ No newline at end of file + placeholder: Any additional details diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index a49eab2..0086358 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1 +1 @@ -blank_issues_enabled: true \ No newline at end of file +blank_issues_enabled: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index 03994b3..5b7d7e8 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -23,4 +23,4 @@ body: attributes: label: Additional context description: Add any other context or screenshots about the feature request here. - placeholder: Any additional information \ No newline at end of file + placeholder: Any additional information diff --git a/.github/ISSUE_TEMPLATE/general_issue.yml b/.github/ISSUE_TEMPLATE/general_issue.yml index 1aa6d79..f480312 100644 --- a/.github/ISSUE_TEMPLATE/general_issue.yml +++ b/.github/ISSUE_TEMPLATE/general_issue.yml @@ -9,4 +9,4 @@ body: attributes: label: Describe the issue description: A clear and concise description of what the question is. - placeholder: The detail of question. \ No newline at end of file + placeholder: The detail of question. diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8944a8a..687857a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -37,4 +37,4 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} shell: pwsh - run: twine upload dist/* \ No newline at end of file + run: twine upload dist/* diff --git a/.gitignore b/.gitignore index b55e6a9..7508476 100644 --- a/.gitignore +++ b/.gitignore @@ -400,4 +400,4 @@ FodyWeavers.xsd # build build/* -dist/* \ No newline at end of file +dist/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 012b522..0aa896d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,26 +26,26 @@ repos: rev: 23.3.0 hooks: - id: black - - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.261 - hooks: - - id: ruff - args: ["--fix"] - - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 - hooks: - - id: codespell - args: ["-L", "ans,linar,nam,"] - exclude: | - (?x)^( - pyproject.toml | - website/static/img/ag.svg | - website/yarn.lock | - notebook/.* - )$ + # - repo: https://github.com/charliermarsh/ruff-pre-commit + # rev: v0.0.261 + # hooks: + # - id: ruff + # args: ["--fix"] + # - repo: https://github.com/codespell-project/codespell + # rev: v2.2.6 + # hooks: + # - id: codespell + # args: ["-L", "ans,linar,nam,"] + # exclude: | + # (?x)^( + # pyproject.toml | + # website/static/img/ag.svg | + # website/yarn.lock | + # notebook/.* + # )$ - repo: https://github.com/nbQA-dev/nbQA rev: 1.7.1 hooks: - - id: nbqa-ruff - args: ["--fix"] - - id: nbqa-black \ No newline at end of file + # - id: nbqa-ruff + # args: ["--fix"] + - id: nbqa-black diff --git a/DOCUMENT.md b/DOCUMENT.md index d2ad221..c99c2dd 100644 --- a/DOCUMENT.md +++ b/DOCUMENT.md @@ -145,7 +145,7 @@ recovered_response = llm_lingua.recover( ### Using phi-2 -Thanks to the efforts of the community, phi-2 is now available for use in LLMLingua. +Thanks to the efforts of the community, phi-2 is now available for use in LLMLingua. 
Before using it, please update your transformers to the GitHub version by running `pip install -U git+https://github.com/huggingface/transformers.git`. diff --git a/Makefile b/Makefile index 8bd4d6b..b0d90d2 100644 --- a/Makefile +++ b/Makefile @@ -13,4 +13,4 @@ style: flake8 $(CHECK_DIRS) test: - @${PYTHON} -m pytest -n auto --dist=loadfile -s -v ./tests/ \ No newline at end of file + @${PYTHON} -m pytest -n auto --dist=loadfile -s -v ./tests/ diff --git a/README.md b/README.md index 79e89fb..656013f 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ -
[README header diff; the HTML markup was lost in extraction. Recoverable content: the LLMLingua logo block, the heading "(Long)LLMLingua: Enhancing Large Language Model Inference via Prompt Compression", and the link row "| Project Page | LLMLingua Paper | LongLLMLingua Paper | HF Space Demo |". The hunk removes and re-adds this block with trailing whitespace stripped.]

@@ -102,7 +102,7 @@ To get started with (Long)LLMLingua, simply install it using pip: ```bash pip install llmlingua ``` - + #### 2. **Using (Long)LLMLingua for Prompt Compression:** With (Long)LLMLingua, you can easily compress your prompts. Here’s how you can do it: @@ -152,8 +152,8 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio ## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/SECURITY.md b/SECURITY.md index e138ec5..9dc6316 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -14,7 +14,7 @@ Instead, please report them to the Microsoft Security Response Center (MSRC) at If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: diff --git a/SUPPORT.md b/SUPPORT.md index 5a28ce6..effba63 100644 --- a/SUPPORT.md +++ b/SUPPORT.md @@ -1,13 +1,13 @@ # Support -## How to file issues and get help +## How to file issues and get help -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue. For help and questions about using this project, please refer the [document](./DOCUMENT.md). -## Microsoft Support Policy +## Microsoft Support Policy Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/Transparency_FAQ.md b/Transparency_FAQ.md index 9712c9d..74034e5 100644 --- a/Transparency_FAQ.md +++ b/Transparency_FAQ.md @@ -1,36 +1,36 @@ -# LLMLingua's Responsible AI FAQ +# LLMLingua's Responsible AI FAQ -## What is LLMLingua? +## What is LLMLingua? -- LLMLingua is a simple and efficient method to compress prompt up to 20x and keeping the original prompt knowledge like ICL, reasoning, etc. 
-- LLMLingua takes user-defined prompts and compression goals as input, and outputs a compressed prompt, which may often result in a form of expression that is difficult for humans to understand. +- LLMLingua is a simple and efficient method to compress prompt up to 20x and keeping the original prompt knowledge like ICL, reasoning, etc. +- LLMLingua takes user-defined prompts and compression goals as input, and outputs a compressed prompt, which may often result in a form of expression that is difficult for humans to understand. -## What can LLMLingua do? +## What can LLMLingua do? -- LLMLingua can simultaneously reduce the length of prompts and the output of LLMs (20%-30%), thus saving API calls; -- Compressed prompts from LLMLingua can be directly used with black-box LLMs, such as ChatGPT, GPT-4, and Claude; -- By compressing prompts, LLMLingua allows for more information to be included within the original token length, thereby improving model performance; -- LLMLingua relies on a small language model, like GPT-2 or LLaMA-7b, for perplexity calculations, which is a relatively low-cost approach; -- Compressed prompts generated by LLMLingua can be understood by LLMs, preserving their original capabilities in downstream tasks and keeping the original prompt knowledge like ICL, reasoning, etc. LLMs can also recover the essential information from the compressed prompts; -- LLMLingua is a robustness method, no need any training for the LLMs; -- Additionally, LLMLingua can be used to compress KV-Cache, which speeds up inference. +- LLMLingua can simultaneously reduce the length of prompts and the output of LLMs (20%-30%), thus saving API calls; +- Compressed prompts from LLMLingua can be directly used with black-box LLMs, such as ChatGPT, GPT-4, and Claude; +- By compressing prompts, LLMLingua allows for more information to be included within the original token length, thereby improving model performance; +- LLMLingua relies on a small language model, like GPT-2 or LLaMA-7b, for perplexity calculations, which is a relatively low-cost approach; +- Compressed prompts generated by LLMLingua can be understood by LLMs, preserving their original capabilities in downstream tasks and keeping the original prompt knowledge like ICL, reasoning, etc. LLMs can also recover the essential information from the compressed prompts; +- LLMLingua is a robustness method, no need any training for the LLMs; +- Additionally, LLMLingua can be used to compress KV-Cache, which speeds up inference. -## What is/are LLMLingua’s intended use(s)? +## What is/are LLMLingua’s intended use(s)? -- Users who call black-box LLM APIs similar to GPT-4, those who utilize ChatGPT to handle longer content, as well as model deployers and cloud service providers, can benefit from these techniques. +- Users who call black-box LLM APIs similar to GPT-4, those who utilize ChatGPT to handle longer content, as well as model deployers and cloud service providers, can benefit from these techniques. -## How was LLMLingua evaluated? What metrics are used to measure performance? +## How was LLMLingua evaluated? What metrics are used to measure performance? -- In our experiments, we conducted a detailed evaluation of the performance of compressed prompts across various tasks, particularly in those involving LLM-specific capabilities, such as In-Context Learning, reasoning tasks, summarization, and conversation tasks. We assessed our approach using compression ratio and performance loss as evaluation metrics. 
+- In our experiments, we conducted a detailed evaluation of the performance of compressed prompts across various tasks, particularly in those involving LLM-specific capabilities, such as In-Context Learning, reasoning tasks, summarization, and conversation tasks. We assessed our approach using compression ratio and performance loss as evaluation metrics. -## What are the limitations of LLMLingua? How can users minimize the impact of LLMLingua’s limitations when using the system? +## What are the limitations of LLMLingua? How can users minimize the impact of LLMLingua’s limitations when using the system? -- The potential harmful, false or biased responses using the compressed prompts would likely be unchanged. Thus using LLMLingua has no inherent benefits or risks when it comes to those types of responsible AI issues. -- LLMLingua may struggle to perform well at particularly high compression ratios, especially when the original prompts are already quite short. +- The potential harmful, false or biased responses using the compressed prompts would likely be unchanged. Thus using LLMLingua has no inherent benefits or risks when it comes to those types of responsible AI issues. +- LLMLingua may struggle to perform well at particularly high compression ratios, especially when the original prompts are already quite short. -## What operational factors and settings allow for effective and responsible use of LLMLingua? +## What operational factors and settings allow for effective and responsible use of LLMLingua? -- Users can set parameters such as the boundaries between different components (instruction, context, question) in the prompt, compression goals, and the small model used for compression calculations. Afterward, they can input the compressed prompt into black-box LLMs for use. +- Users can set parameters such as the boundaries between different components (instruction, context, question) in the prompt, compression goals, and the small model used for compression calculations. Afterward, they can input the compressed prompt into black-box LLMs for use. ## What is instruction, context, and question? 
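The FAQ above ends on the instruction/context/question split, which is exactly the boundary the new unit tests exercise. As a minimal usage sketch (not part of any patch in this series): the model name and token budget below mirror tests/test_llmlingua.py, and the context string is a placeholder.

```python
from llmlingua import PromptCompressor

# Small model for perplexity-based compression, same as in the new unit tests.
llm_lingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu")

result = llm_lingua.compress_prompt(
    # Compressible sections: few-shot demonstrations or retrieved passages.
    context=["<few-shot demonstrations or retrieved passages>"],
    # Instruction and question are preserved rather than compressed.
    instruction="Please reference the following examples to answer the math question,",
    question="Question: <your question here>",
    target_token=150,  # token budget used by the unit tests
)

print(result["compressed_prompt"])
# Bookkeeping fields asserted by the tests, e.g. ratio "1.4x" and rate "69.4%":
print(result["origin_tokens"], result["compressed_tokens"], result["ratio"], result["rate"])
```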
diff --git a/examples/CoT.ipynb b/examples/CoT.ipynb index 9430631..336fed8 100644 --- a/examples/CoT.ipynb +++ b/examples/CoT.ipynb @@ -147,6 +147,7 @@ "source": [ "# Download the original prompt and dataset\n", "from datasets import load_dataset\n", + "\n", "!wget https://raw.githubusercontent.com/FranxYao/chain-of-thought-hub/main/gsm8k/lib_prompt/prompt_hardest.txt\n", "prompt_complex = open(\"./prompt_hardest.txt\").read()\n", "gsm8k = load_dataset(\"gsm8k\", \"main\")\n", @@ -162,6 +163,7 @@ "source": [ "# Using the OAI\n", "import openai\n", + "\n", "openai.api_key = \"\"" ] }, @@ -174,10 +176,11 @@ "source": [ "# or Using the AOAI\n", "import openai\n", + "\n", "openai.api_key = \"\"\n", "openai.api_base = \"https://xxxx.openai.azure.com/\"\n", - "openai.api_type = 'azure'\n", - "openai.api_version = '2023-05-15'" + "openai.api_type = \"azure\"\n", + "openai.api_version = \"2023-05-15\"" ] }, { @@ -267,6 +270,7 @@ "source": [ "# The response from original prompt\n", "import json\n", + "\n", "instruction = \"Please reference the following examples to answer the math question,\\n\"\n", "prompt = instruction + prompt_complex + \"\\n\\nQuestion: \" + question\n", "\n", @@ -328,6 +332,7 @@ "source": [ "# Setup LLMLingua\n", "from llmlingua import PromptCompressor\n", + "\n", "llm_lingua = PromptCompressor()" ] }, @@ -382,7 +387,9 @@ ")\n", "\n", "instruction = \"Please reference the following examples to answer the math question,\\n\"\n", - "prompt = instruction + compressed_prompt[\"compressed_prompt\"] + \"\\n\\nQuestion: \" + question\n", + "prompt = (\n", + " instruction + compressed_prompt[\"compressed_prompt\"] + \"\\n\\nQuestion: \" + question\n", + ")\n", "\n", "request_data = {\n", " \"prompt\": prompt,\n", @@ -418,6 +425,7 @@ "source": [ "import re\n", "\n", + "\n", "def extract_ans(ans_model):\n", " ans_model = ans_model.split(\"\\n\")\n", " ans = []\n", @@ -431,6 +439,7 @@ " residual = \"\\n\".join(residual)\n", " return ans, residual\n", "\n", + "\n", "def parse_pred_ans(filename):\n", " with open(filename) as fd:\n", " lines = fd.readlines()\n", @@ -506,6 +515,7 @@ "# Test in GSM8K test set\n", "from tqdm import tqdm\n", "import os\n", + "\n", "os.makedirs(\"outputs\", exist_ok=True)\n", "i = 0\n", "\n", @@ -518,11 +528,20 @@ " iterative_size=100,\n", ")\n", "\n", - "for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']), \n", - " total=len(gsm8k_test['question'])):\n", - " instruction = \"Please reference the following examples to answer the math question,\\n\"\n", - " prompt = instruction + compressed_prompt[\"compressed_prompt\"] + \"\\n\\nQuestion: \" + q + \"\\n\"\n", - " \n", + "for q, a in tqdm(\n", + " zip(gsm8k_test[\"question\"], gsm8k_test[\"answer\"]), total=len(gsm8k_test[\"question\"])\n", + "):\n", + " instruction = (\n", + " \"Please reference the following examples to answer the math question,\\n\"\n", + " )\n", + " prompt = (\n", + " instruction\n", + " + compressed_prompt[\"compressed_prompt\"]\n", + " + \"\\n\\nQuestion: \"\n", + " + q\n", + " + \"\\n\"\n", + " )\n", + "\n", " request_data = {\n", " \"prompt\": prompt,\n", " \"max_tokens\": 400,\n", @@ -537,8 +556,11 @@ " )\n", " ans_model = response[\"choices\"][0][\"text\"]\n", " ans_, residual = extract_ans(ans_model)\n", - " with open('outputs/test_gpt_3.5_turbo_LLMLingua_174.txt', 'a') as fd:\n", - " fd.write(\"Q: %s\\nA_model:\\n%s\\nA:\\n%s\\n\\n\" % (q, ans_.replace(\"Q:\", \"\").replace(\"A:\", \"\"), a))\n", + " with open(\"outputs/test_gpt_3.5_turbo_LLMLingua_174.txt\", \"a\") as 
fd:\n", + " fd.write(\n", + " \"Q: %s\\nA_model:\\n%s\\nA:\\n%s\\n\\n\"\n", + " % (q, ans_.replace(\"Q:\", \"\").replace(\"A:\", \"\"), a)\n", + " )\n", " i += 1" ] }, diff --git a/examples/Code.ipynb b/examples/Code.ipynb index 7a09e03..cd3e5da 100644 --- a/examples/Code.ipynb +++ b/examples/Code.ipynb @@ -115,7 +115,8 @@ "source": [ "# Download the original prompt and dataset\n", "from datasets import load_dataset\n", - "dataset = load_dataset('THUDM/LongBench', \"repobench-p\", split='test')" + "\n", + "dataset = load_dataset(\"THUDM/LongBench\", \"repobench-p\", split=\"test\")" ] }, { @@ -127,6 +128,7 @@ "source": [ "# Using the OAI\n", "import openai\n", + "\n", "openai.api_key = \"\"" ] }, @@ -139,10 +141,11 @@ "source": [ "# or Using the AOAI\n", "import openai\n", + "\n", "openai.api_key = \"\"\n", "openai.api_base = \"https://xxxx.openai.azure.com/\"\n", - "openai.api_type = 'azure'\n", - "openai.api_version = '2023-05-15'" + "openai.api_type = \"azure\"\n", + "openai.api_version = \"2023-05-15\"" ] }, { @@ -161,7 +164,9 @@ "outputs": [], "source": [ "# select an example from MeetingBank\n", - "contexts, question, answer = [dataset[1][key] for key in [\"context\", \"input\", \"answers\"]]\n", + "contexts, question, answer = [\n", + " dataset[1][key] for key in [\"context\", \"input\", \"answers\"]\n", + "]\n", "instruction = \"Please complete the code given below.\"\n", "question = question + \"\\n\\nNext line of code:\\n\"" ] @@ -233,6 +238,7 @@ "source": [ "# The response from original prompt, using GPT-4-32k\n", "import json\n", + "\n", "prompt = \"\\n\\n\".join([instruction, contexts, question])\n", "\n", "message = [\n", @@ -296,6 +302,7 @@ "source": [ "# Setup LLMLingua\n", "from llmlingua import PromptCompressor\n", + "\n", "llm_lingua = PromptCompressor()" ] }, @@ -307,7 +314,9 @@ "outputs": [], "source": [ "contexts_list = contexts.split(\"\\n\")\n", - "contexts_list = [\"\\n\".join(contexts_list[ii: ii + 4]) for ii in range(0, len(contexts_list), 4)]" + "contexts_list = [\n", + " \"\\n\".join(contexts_list[ii : ii + 4]) for ii in range(0, len(contexts_list), 4)\n", + "]" ] }, { @@ -359,12 +368,12 @@ " question=question,\n", " target_token=2000,\n", " condition_compare=True,\n", - " condition_in_question='after',\n", - " rank_method='longllmlingua',\n", + " condition_in_question=\"after\",\n", + " rank_method=\"longllmlingua\",\n", " use_sentence_level_filter=False,\n", " context_budget=\"+100\",\n", - " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", - " reorder_context=\"sort\"\n", + " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", + " reorder_context=\"sort\",\n", ")\n", "message = [\n", " {\"role\": \"user\", \"content\": compressed_prompt[\"compressed_prompt\"]},\n", diff --git a/examples/OnlineMeeting.ipynb b/examples/OnlineMeeting.ipynb index 48bb81e..016d292 100644 --- a/examples/OnlineMeeting.ipynb +++ b/examples/OnlineMeeting.ipynb @@ -115,6 +115,7 @@ "source": [ "# Download the original prompt and dataset\n", "from datasets import load_dataset\n", + "\n", "dataset = load_dataset(\"lytang/MeetingBank-transcript\")[\"train\"]" ] }, @@ -127,6 +128,7 @@ "source": [ "# Using the OAI\n", "import openai\n", + "\n", "openai.api_key = \"\"" ] }, @@ -139,10 +141,11 @@ "source": [ "# or Using the AOAI\n", "import openai\n", + "\n", "openai.api_key = \"\"\n", "openai.api_base = \"https://xxxx.openai.azure.com/\"\n", - "openai.api_type = 'azure'\n", - "openai.api_version = '2023-05-15'" + 
"openai.api_type = \"azure\"\n", + "openai.api_version = \"2023-05-15\"" ] }, { @@ -220,6 +223,7 @@ "source": [ "# The response from original prompt, using GPT-4-32k\n", "import json\n", + "\n", "prompt = \"\\n\\n\".join([contexts, question])\n", "\n", "message = [\n", @@ -275,6 +279,7 @@ "source": [ "# Setup LLMLingua\n", "from llmlingua import PromptCompressor\n", + "\n", "llm_lingua = PromptCompressor()" ] }, @@ -327,12 +332,12 @@ " question=question,\n", " target_token=200,\n", " condition_compare=True,\n", - " condition_in_question='after',\n", - " rank_method='longllmlingua',\n", + " condition_in_question=\"after\",\n", + " rank_method=\"longllmlingua\",\n", " use_sentence_level_filter=False,\n", " context_budget=\"+100\",\n", - " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", - " reorder_context=\"sort\"\n", + " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", + " reorder_context=\"sort\",\n", ")\n", "message = [\n", " {\"role\": \"user\", \"content\": compressed_prompt[\"compressed_prompt\"]},\n", @@ -411,6 +416,7 @@ "source": [ "# The response from original prompt, using GPT-4-32k\n", "import json\n", + "\n", "prompt = \"\\n\\n\".join([contexts, question])\n", "\n", "message = [\n", @@ -481,11 +487,11 @@ " question=question,\n", " target_token=200,\n", " condition_compare=True,\n", - " condition_in_question='after',\n", - " rank_method='longllmlingua',\n", + " condition_in_question=\"after\",\n", + " rank_method=\"longllmlingua\",\n", " use_sentence_level_filter=True,\n", " context_budget=\"+100\",\n", - " reorder_context=\"sort\"\n", + " reorder_context=\"sort\",\n", ")\n", "message = [\n", " {\"role\": \"user\", \"content\": compressed_prompt[\"compressed_prompt\"]},\n", @@ -523,7 +529,9 @@ "metadata": {}, "outputs": [], "source": [ - "question = \"Question: what are the arrangements the Police Department will make this year?\"\n", + "question = (\n", + " \"Question: what are the arrangements the Police Department will make this year?\"\n", + ")\n", "reference = \"enhancing community engagement and internal communication models, building a culture of accountability and transparency, and prioritizing recruitment and retention.\"" ] }, @@ -564,6 +572,7 @@ "source": [ "# The response from original prompt, using GPT-4-32k\n", "import json\n", + "\n", "prompt = \"\\n\\n\".join([contexts, question])\n", "\n", "message = [\n", @@ -634,12 +643,12 @@ " question=question,\n", " target_token=2000,\n", " condition_compare=True,\n", - " condition_in_question='after',\n", - " rank_method='longllmlingua',\n", + " condition_in_question=\"after\",\n", + " rank_method=\"longllmlingua\",\n", " use_sentence_level_filter=False,\n", " context_budget=\"+100\",\n", - " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", - " reorder_context=\"sort\"\n", + " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", + " reorder_context=\"sort\",\n", ")\n", "message = [\n", " {\"role\": \"user\", \"content\": compressed_prompt[\"compressed_prompt\"]},\n", diff --git a/examples/RAG.ipynb b/examples/RAG.ipynb index f585f97..827f418 100644 --- a/examples/RAG.ipynb +++ b/examples/RAG.ipynb @@ -131,6 +131,7 @@ "source": [ "# Using the OAI\n", "import openai\n", + "\n", "openai.api_key = \"\"" ] }, @@ -143,10 +144,11 @@ "source": [ "# or Using the AOAI\n", "import openai\n", + "\n", "openai.api_key = \"\"\n", "openai.api_base = \"https://xxxx.openai.azure.com/\"\n", - 
"openai.api_type = 'azure'\n", - "openai.api_version = '2023-05-15'" + "openai.api_type = \"azure\"\n", + "openai.api_version = \"2023-05-15\"" ] }, { @@ -178,7 +180,6 @@ "from tqdm import tqdm\n", "from lost_in_the_middle.prompting import (\n", " Document,\n", - " get_closedbook_qa_prompt,\n", " get_qa_prompt,\n", ")\n", "\n", @@ -202,7 +203,15 @@ " c = prompt.split(\"\\n\\n\")\n", " instruction, question = c[0], c[-1]\n", " demonstration = \"\\n\".join(c[1:-1])\n", - " datasets.append({\"id\": ii, \"instruction\": instruction, \"demonstration\": demonstration, \"question\": question, \"answer\": input_example[\"answers\"]})" + " datasets.append(\n", + " {\n", + " \"id\": ii,\n", + " \"instruction\": instruction,\n", + " \"demonstration\": demonstration,\n", + " \"question\": question,\n", + " \"answer\": input_example[\"answers\"],\n", + " }\n", + " )" ] }, { @@ -213,7 +222,9 @@ "outputs": [], "source": [ "# select an example from NaturalQuestions\n", - "instruction, demonstration_str, question, answer = [datasets[23][key] for key in [\"instruction\", \"demonstration\", \"question\", \"answer\"]]" + "instruction, demonstration_str, question, answer = [\n", + " datasets[23][key] for key in [\"instruction\", \"demonstration\", \"question\", \"answer\"]\n", + "]" ] }, { @@ -345,6 +356,7 @@ "source": [ "# Setup LLMLingua\n", "from llmlingua import PromptCompressor\n", + "\n", "llm_lingua = PromptCompressor()" ] }, @@ -397,12 +409,12 @@ " question=question,\n", " target_token=500,\n", " condition_compare=True,\n", - " condition_in_question='after',\n", - " rank_method='longllmlingua',\n", + " condition_in_question=\"after\",\n", + " rank_method=\"longllmlingua\",\n", " use_sentence_level_filter=False,\n", " context_budget=\"+100\",\n", - " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", - " reorder_context=\"sort\"\n", + " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", + " reorder_context=\"sort\",\n", ")\n", "message = [\n", " {\"role\": \"user\", \"content\": compressed_prompt[\"compressed_prompt\"]},\n", @@ -474,12 +486,12 @@ " question=question,\n", " target_token=100,\n", " condition_compare=True,\n", - " condition_in_question='after',\n", - " rank_method='longllmlingua',\n", + " condition_in_question=\"after\",\n", + " rank_method=\"longllmlingua\",\n", " use_sentence_level_filter=False,\n", " context_budget=\"+100\",\n", - " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", - " reorder_context=\"sort\"\n", + " dynamic_context_compression_ratio=0.4, # enable dynamic_context_compression_ratio\n", + " reorder_context=\"sort\",\n", ")\n", "message = [\n", " {\"role\": \"user\", \"content\": compressed_prompt[\"compressed_prompt\"]},\n", diff --git a/examples/RAGLlamaIndex.ipynb b/examples/RAGLlamaIndex.ipynb index 2a007fb..56c56d4 100644 --- a/examples/RAGLlamaIndex.ipynb +++ b/examples/RAGLlamaIndex.ipynb @@ -13,8 +13,8 @@ "id": "05d999bc-83a3-454f-a8a4-44cbff1fcedc", "metadata": {}, "source": [ - "\r\n", - " \"Open\r\n", + "\n", + " \"Open\n", "" ] }, @@ -198,6 +198,7 @@ "source": [ "# Using the OAI\n", "import openai\n", + "\n", "openai.api_key = \"\"" ] }, @@ -210,10 +211,11 @@ "source": [ "# or Using the AOAI\n", "import openai\n", + "\n", "openai.api_key = \"\"\n", "openai.api_base = \"https://xxxx.openai.azure.com/\"\n", - "openai.api_type = 'azure'\n", - "openai.api_version = '2023-05-15'" + "openai.api_type = \"azure\"\n", + "openai.api_version = \"2023-05-15\"" ] }, { 
@@ -271,8 +273,6 @@ "from llama_index import (\n", " VectorStoreIndex,\n", " SimpleDirectoryReader,\n", - " load_index_from_storage,\n", - " StorageContext,\n", ")" ] }, @@ -284,9 +284,7 @@ "outputs": [], "source": [ "# load documents\n", - "documents = SimpleDirectoryReader(\n", - " input_files=[\"paul_graham_essay.txt\"]\n", - ").load_data()" + "documents = SimpleDirectoryReader(input_files=[\"paul_graham_essay.txt\"]).load_data()" ] }, { @@ -518,7 +516,7 @@ "\n", "Original Tokens: 10719\n", "Compressed Tokens: 308\n", - "Comressed Ratio: 34.80x\n" + "Compressed Ratio: 34.80x\n" ] } ], @@ -533,7 +531,7 @@ "print()\n", "print(\"Original Tokens:\", original_tokens)\n", "print(\"Compressed Tokens:\", compressed_tokens)\n", - "print(\"Comressed Ratio:\", f\"{original_tokens/(compressed_tokens + 1e-5):.2f}x\")" + "print(\"Compressed Ratio:\", f\"{original_tokens/(compressed_tokens + 1e-5):.2f}x\")" ] }, { diff --git a/examples/Retrieval.ipynb b/examples/Retrieval.ipynb index d5fe2c1..b960a23 100644 --- a/examples/Retrieval.ipynb +++ b/examples/Retrieval.ipynb @@ -144,7 +144,7 @@ "id": "f8676ffa-5117-44dc-9742-bb9ab1d56e0c", "metadata": {}, "source": [ - "## Evaulate the Reranker Performance" + "## Evaluate the Reranker Performance" ] }, { @@ -156,6 +156,7 @@ "source": [ "# Setup LLMLingua\n", "from llmlingua import PromptCompressor\n", + "\n", "llm_lingua = PromptCompressor()" ] }, @@ -174,10 +175,10 @@ "\n", "from lost_in_the_middle.prompting import (\n", " Document,\n", - " get_closedbook_qa_prompt,\n", " get_qa_prompt,\n", ")\n", "\n", + "\n", "def get_reranker_results(rank_method):\n", " path = \"lost-in-the-middle/qa_data/20_total_documents/nq-open-20_total_documents_gold_at_9.jsonl.gz\"\n", " d_idx = 9\n", @@ -189,31 +190,55 @@ " documents = []\n", " for ctx in deepcopy(input_example[\"ctxs\"]):\n", " documents.append(Document.from_dict(ctx))\n", - " \n", + "\n", " prompt = get_qa_prompt(\n", " question,\n", " documents,\n", " mention_random_ordering=False,\n", " query_aware_contextualization=False,\n", " )\n", - " \n", + "\n", " c = prompt.split(\"\\n\\n\")\n", " instruction, question = c[0], c[-1]\n", " demonstration = \"\\n\".join(c[1:-1])\n", " corpus = demonstration.split(\"\\n\")\n", - " \n", - " idx = llm_lingua.get_rank_results(corpus, question, rank_method, \"none\" if rank_method == \"llmlingua\" else \"after\", [0] * 20)\n", + "\n", + " idx = llm_lingua.get_rank_results(\n", + " corpus,\n", + " question,\n", + " rank_method,\n", + " \"none\" if rank_method == \"llmlingua\" else \"after\",\n", + " [0] * 20,\n", + " )\n", " idx = [ii[0] for ii in idx].index(d_idx)\n", " res.append(idx)\n", " logs = [rank_method]\n", " for idx in range(1, 21):\n", " acc = len([ii for ii in res if ii < idx]) / len(res) * 100\n", - " print(\"R@{},{:.2f}\".format(idx, len([ii for ii in res if ii < idx]) / len(res) * 100))\n", + " print(\n", + " \"R@{},{:.2f}\".format(\n", + " idx, len([ii for ii in res if ii < idx]) / len(res) * 100\n", + " )\n", + " )\n", " logs.append(\"{:.2f}\".format(acc))\n", " with open(\"retrieval.csv\", \"a\") as f:\n", " f.write(\",\".join(logs) + \"\\n\")\n", "\n", - "for rank_method in [\"bm25\", \"gzip\", \"sentbert\", \"openai\", \"bge\", \"bge_reranker\", \"bge_llmembedder\", \"jinza\", \"voyageai\", \"cohere\", \"llmlingua\", \"longllmlingua\"]:\n", + "\n", + "for rank_method in [\n", + " \"bm25\",\n", + " \"gzip\",\n", + " \"sentbert\",\n", + " \"openai\",\n", + " \"bge\",\n", + " \"bge_reranker\",\n", + " \"bge_llmembedder\",\n", + " \"jinza\",\n", + " 
\"voyageai\",\n", + " \"cohere\",\n", + " \"llmlingua\",\n", + " \"longllmlingua\",\n", + "]:\n", " get_reranker_results(rank_method)" ] }, @@ -234,7 +259,9 @@ "source": [ "recall_str = open(\"retrieval.csv\").read()\n", "recall_list = [ii.split(\"\\t\") for ii in recall_str.split(\"\\n\\n\")]\n", - "recall_list_data = [[ii[0], j + 1, float(k)] for ii in recall_list for j, k in enumerate(ii[1:])]" + "recall_list_data = [\n", + " [ii[0], j + 1, float(k)] for ii in recall_list for j, k in enumerate(ii[1:])\n", + "]" ] }, { @@ -255,52 +282,58 @@ } ], "source": [ - "import joblib\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", - "import math\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib.patches import Patch\n", - "from matplotlib.lines import Line2D\n", "\n", "sns.set_theme(\"poster\", style=\"darkgrid\", font_scale=1.2)\n", - "plt.rcParams['pdf.fonttype'] = 42\n", - "plt.rcParams['ps.fonttype'] = 42\n", + "plt.rcParams[\"pdf.fonttype\"] = 42\n", + "plt.rcParams[\"ps.fonttype\"] = 42\n", "\n", "# plt.rc('font', family=\"Times New Roman\", size=30) #controls default text size\n", - "plt.rc('axes', titlesize=40) #fontsize of the title\n", - "plt.rc('axes', labelsize=40) #fontsize of the x and y labels\n", - "plt.rc('xtick', labelsize=40) #fontsize of the x tick labels\n", - "plt.rc('ytick', labelsize=40) #fontsize of the y tick labels\n", - "plt.rc('legend', fontsize=25) #fontsize of the legend\n", + "plt.rc(\"axes\", titlesize=40) # fontsize of the title\n", + "plt.rc(\"axes\", labelsize=40) # fontsize of the x and y labels\n", + "plt.rc(\"xtick\", labelsize=40) # fontsize of the x tick labels\n", + "plt.rc(\"ytick\", labelsize=40) # fontsize of the y tick labels\n", + "plt.rc(\"legend\", fontsize=25) # fontsize of the legend\n", "\n", - "plt.figure(figsize=(14,10))\n", + "plt.figure(figsize=(14, 10))\n", "pq = pd.DataFrame(recall_list_data)\n", "pq.columns = [\"Method\", \"k\", \"R@k\"]\n", "\n", - "methods = [\"LLMLingua\", \"BM25\", \"OpenAI\", \"Voageai\", \"BPE-large-en v1.5\", \"SBERT\", \"Gzip\", \"Cohere-Rerank\", \"BGE-llmembeder\",\"Jina\", \"LongLLMLingua $r_k$ \\nw/o restrict\", \"BGE-Ranker-large\",\"LongLLMLingua $r_k$\"]\n", + "methods = [\n", + " \"LLMLingua\",\n", + " \"BM25\",\n", + " \"OpenAI\",\n", + " \"Voageai\",\n", + " \"BPE-large-en v1.5\",\n", + " \"SBERT\",\n", + " \"Gzip\",\n", + " \"Cohere-Rerank\",\n", + " \"BGE-llmembeder\",\n", + " \"Jina\",\n", + " \"LongLLMLingua $r_k$ \\nw/o restrict\",\n", + " \"BGE-Ranker-large\",\n", + " \"LongLLMLingua $r_k$\",\n", + "]\n", "colors = [\"4\", \"1\", \"2\", \"2\", \"2\", \"2\", \"1\", \"3\", \"3\", \"2\", \"5\", \"3\", \"5\"]\n", - "markers = ['>', '^', 's', 'o', \"<\", \".\", \"*\", '>', '^', 's', 'o', \"<\", \".\", \"*\"]\n", + "markers = [\">\", \"^\", \"s\", \"o\", \"<\", \".\", \"*\", \">\", \"^\", \"s\", \"o\", \"<\", \".\", \"*\"]\n", "for m, c, ma in zip(methods, colors, markers):\n", " plt.plot(\n", " pq[pq[\"Method\"] == m][\"k\"],\n", - " pq[pq[\"Method\"] == m]['R@k'],\n", + " pq[pq[\"Method\"] == m][\"R@k\"],\n", " alpha=0.65,\n", - " color=f'C{c}',\n", - "# marker=ma,\n", + " color=f\"C{c}\",\n", + " # marker=ma,\n", " label=m,\n", " linewidth=7,\n", " markersize=5 if ma == \"*\" else 5,\n", " )\n", "\n", - "plt.xlabel('Number of Retained Documents')\n", - "plt.ylabel(f'Recall(%)')\n", + "plt.xlabel(\"Number of Retained Documents\")\n", + "plt.ylabel(\"Recall(%)\")\n", 
"plt.xticks([1, 5, 10, 15, 20], labels=[\"1\", \"5\", \"10\", \"15\", \"20\"])\n", - "plt.legend(loc='lower right')\n", + "plt.legend(loc=\"lower right\")\n", "plt.tight_layout()\n", "# plt.savefig(\"retrieval.pdf\", dpi=1000)\n", "\n", diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index 045f57f..c9bfa1b 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -266,9 +266,12 @@ def structured_compress_prompt( ) else: rate = target_token / sum(context_tokens_length) - context, context_segs, context_segs_rate, context_segs_compress = ( - self.segment_structured_context(context, rate) - ) + ( + context, + context_segs, + context_segs_rate, + context_segs_compress, + ) = self.segment_structured_context(context, rate) return self.compress_prompt( context, instruction, diff --git a/setup.cfg b/setup.cfg index 21266a7..d9f2745 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,4 +25,4 @@ use_parentheses = True [flake8] ignore = E203, E501, E741, W503, W605 -max-line-length = 119 \ No newline at end of file +max-line-length = 119 diff --git a/setup.py b/setup.py index eac19b6..4f81a50 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ # Copyright (c) 2023 Microsoft # Licensed under The MIT License [see LICENSE for details] - + from setuptools import find_packages, setup # PEP0440 compatible formatted version, see: diff --git a/tests/test_llmlingua.py b/tests/test_llmlingua.py index 687540e..3daf3a4 100644 --- a/tests/test_llmlingua.py +++ b/tests/test_llmlingua.py @@ -1,5 +1,4 @@ import unittest -import unittest.mock as mock from llmlingua import PromptCompressor diff --git a/tests/test_longllmlingua.py b/tests/test_longllmlingua.py index 83808ea..6dccd5f 100644 --- a/tests/test_longllmlingua.py +++ b/tests/test_longllmlingua.py @@ -1,5 +1,4 @@ import unittest -import unittest.mock as mock from llmlingua import PromptCompressor From b68c3a35e44b3be6c98bdf7732b29fb00a153b3d Mon Sep 17 00:00:00 2001 From: Huiqiang Jiang Date: Tue, 20 Feb 2024 14:47:42 +0000 Subject: [PATCH 14/19] Fix(LLMLingua): fix the unittest --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4f81a50..deefefe 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ INSTALL_REQUIRES = [ "transformers>=4.26.0", + "accelerate", "torch", "tiktoken", "nltk", @@ -45,7 +46,7 @@ author="The LLMLingua team", author_email="hjiang@microsoft.com", description="To speed up LLMs' inference and enhance LLM's perceive of key information, compress the prompt and KV-Cache, which achieves up to 20x compression with minimal performance loss.", - long_description=open("README.md").read(), + long_description=open("README.md", encoding="utf8").read(), long_description_content_type="text/markdown", keywords="Prompt Compression, LLMs, Inference Acceleration, Black-box LLMs, Efficient LLMs", license="MIT License", From 6a2dcdd5f9e1df74302a8989cc03aa36e9562923 Mon Sep 17 00:00:00 2001 From: Huiqiang Jiang Date: Tue, 20 Feb 2024 14:51:56 +0000 Subject: [PATCH 15/19] Fix(LLMLingua): fix the unitest --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index deefefe..ae0f4fb 100644 --- a/setup.py +++ b/setup.py @@ -57,8 +57,8 @@ "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - package_dir={"": "./"}, - packages=find_packages("./"), + package_dir={"": "."}, + packages=find_packages("."), extras_require={ "dev": DEV_REQUIRES, "quality": 
QUANLITY_REQUIRES, From 61e7e510f3115457f5a0857c4be6114f0dd37133 Mon Sep 17 00:00:00 2001 From: Huiqiang Jiang Date: Tue, 20 Feb 2024 15:01:33 +0000 Subject: [PATCH 16/19] Fix(LLMLingua): fix the unitest --- .github/workflows/unittest.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index a2bfcf4..165e078 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -21,11 +21,10 @@ jobs: python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - name: Setup python ${{ inputs.python-version }} - id: setup-python + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 with: - python-version: ${{ inputs.python-version }} + python-version: ${{ matrix.python-version }} - name: Install packages and dependencies for all tests run: | From b3b74769aa7e43c65220e01e2819a2591a2abfaa Mon Sep 17 00:00:00 2001 From: Huiqiang Jiang Date: Wed, 21 Feb 2024 06:14:05 +0000 Subject: [PATCH 17/19] Feature(LLMLingua): update the release script --- .github/workflows/release.yml | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 687857a..665ba25 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -17,10 +17,18 @@ jobs: os: ['ubuntu-latest'] python-version: [3.10] runs-on: ${{ matrix.os }} - environment: package + environment: + name: pypi + url: https://pypi.org/project/llmlingua/ + permissions: + id-token: write steps: - name: Checkout uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} - name: Install from source # This is required for the pre-commit tests @@ -32,9 +40,7 @@ jobs: run: | pip install twine python setup.py sdist bdist_wheel - - name: Publish to PyPI - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - shell: pwsh - run: twine upload dist/* + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + print-hash: true From 049a113087577fe2354872c5f9a65b335f62ff01 Mon Sep 17 00:00:00 2001 From: Huiqiang Jiang Date: Mon, 26 Feb 2024 07:28:38 +0000 Subject: [PATCH 18/19] Feature(LLMLingua): add HF_TOKEN --- .github/workflows/unittest.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 165e078..a10ada6 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -12,7 +12,7 @@ defaults: permissions: {} jobs: - UniTest: + UnitTest: runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -37,5 +37,7 @@ jobs: - name: Run core tests shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN}} run: | make test From 9f97ba7c2daf4a34b87843c4ba6f579474e84b9d Mon Sep 17 00:00:00 2001 From: SiyunZhao <49901104+SiyunZhao@users.noreply.github.com> Date: Wed, 28 Feb 2024 12:52:50 +0800 Subject: [PATCH 19/19] Added unittest for structured_compress_prompt and fixed bugs (#95) - fix bugs and add unittests - make style - add nltk init - fix nltk file exist error - add unittest for different models Co-authored-by: Siyun Zhao Co-authored-by: Qianhui Wu Co-authored-by: Xufang Luo <34053802+XufangLuo@users.noreply.github.com> Co-authored-by: Yuqing Yang --- llmlingua/prompt_compressor.py | 
70 +++++++++++----------- tests/test_llmlingua.py | 85 +++++++++++++++++++++++++++ tests/test_longllmlingua.py | 102 +++++++++++++++++++++++++++++++++ 3 files changed, 225 insertions(+), 32 deletions(-) diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index c9bfa1b..9d5b757 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -248,7 +248,10 @@ def structured_compress_prompt( context = [" "] if isinstance(context, str): context = [context] - + context = [ + self.tokenizer.decode(self.tokenizer(c, add_special_tokens=False).input_ids) + for c in context + ] context_tokens_length = [self.get_token_length(c) for c in context] instruction_tokens_length, question_tokens_length = self.get_token_length( instruction @@ -488,7 +491,7 @@ def compress_prompt( if condition_flag: prefix = question + "\n\n" + instruction if add_instruction else question if ( - self.get_token_length(prefix) + 2 + iterative_size * 2 + self.get_token_length(prefix + "\n\n") + iterative_size * 2 > self.max_position_embeddings ): tokens = self.tokenizer(prefix, add_special_tokens=False).input_ids @@ -502,7 +505,7 @@ def compress_prompt( + 2 * iterative_size : ] ) - start = self.get_token_length(prefix) + 2 + start = self.get_prefix_length(prefix + "\n\n", context[0]) context = [prefix] + context else: start = 0 @@ -556,6 +559,18 @@ def get_token_length(self, text: str, add_special_tokens: bool = True): self.tokenizer(text, add_special_tokens=add_special_tokens).input_ids ) + def get_prefix_length(self, prefix: str, text: str): + possible_prefix_token = max(self.get_token_length(prefix, False) - 3, 1) + full_input_ids = self.tokenizer( + prefix + text[:100], add_special_tokens=False + ).input_ids + for i in range(possible_prefix_token, len(full_input_ids)): + cur_prefix = self.tokenizer.decode(full_input_ids[:i]) + if cur_prefix == prefix: + break + assert self.tokenizer.decode(full_input_ids[i:]) == text[:100] + return i + def get_condition_ppl( self, text: str, @@ -633,9 +648,11 @@ def get_structured_dynamic_compression_ratio( seg_info: List[List[tuple]] = None, ): if start: - context = context[1:] - global_dynamic_rate, global_dynamic_compress, tmp_context = [], [], [] - for context_idx, text in enumerate(context): + pure_context = context[1:] + else: + pure_context = context + global_dynamic_rate, global_dynamic_compress, segments = [], [], [] + for context_idx, text in enumerate(pure_context): text_seen = 0 for seg_idx, (seg_len, seg_rate, seg_compress) in enumerate( seg_info[context_idx] @@ -643,22 +660,28 @@ def get_structured_dynamic_compression_ratio( seg_text = text[text_seen : text_seen + seg_len] if ( seg_idx == len(seg_info[context_idx]) - 1 - and context_idx != len(context) - 1 + and context_idx != len(pure_context) - 1 ): seg_text += "\n\n" - tmp_context.append(seg_text) + segments.append(seg_text) if seg_compress: global_dynamic_rate.append(seg_rate) else: global_dynamic_rate.append(1.0) global_dynamic_compress.append(seg_compress) text_seen += seg_len - origin_text = "\n\n".join(context) - assert len("".join(tmp_context)) == len(origin_text) + origin_text = "\n\n".join(pure_context) + assert len("".join(segments)) == len(origin_text) + assert len(segments) == len(global_dynamic_rate) == len(global_dynamic_compress) + + text_input_ids = self.tokenizer( + "\n\n".join(context), add_special_tokens=False + ).input_ids[start:] + assert self.tokenizer.decode(text_input_ids) == origin_text dynamic_compression_ratio = self.token_segment( - origin_text, + 
text_input_ids, iterative_size, - tmp_context, + segments, global_dynamic_rate, global_dynamic_compress, ) @@ -666,15 +689,12 @@ def get_structured_dynamic_compression_ratio( def token_segment( self, - text: str, + text_input_ids: List[int], iterative_size: int, segments: List[str], global_dynamic_rate: List[float], global_dynamic_compress: List[bool], ): - assert len(segments) == len(global_dynamic_rate) == len(global_dynamic_compress) - assert text == "".join(segments) - text_input_ids = self.tokenizer(text, add_special_tokens=False).input_ids decode_window = 3 seg_idx, seg_seen, token_seen_num, last_rate = 0, 0, 0, -1 dynamic_compression_rate, local_compresssion_rate = [], [] @@ -953,26 +973,12 @@ def sync_sentence(segments, text): new_segments_info = [] for s in sentences: tmp = [jj for ii, jj in enumerate(s) if sentence_flags[idx + ii]] - res.append("\n".join(tmp)) + res.append("".join(tmp)) if context_segs is not None: segment_ratio = [] for ii in range(len(s)): if sentence_flags[idx + ii]: - last_element = ( - sen2seg_ratio[idx + ii][-1][0] + 1, - sen2seg_ratio[idx + ii][-1][1], - sen2seg_ratio[idx + ii][-1][2], - ) - segment_ratio.extend( - sen2seg_ratio[idx + ii][:-1] + [last_element] - ) - segment_ratio = segment_ratio[:-1] + [ - ( - segment_ratio[-1][0] - 1, - segment_ratio[-1][1], - segment_ratio[-1][2], - ) - ] + segment_ratio.extend(sen2seg_ratio[idx + ii]) new_segments_info.append(segment_ratio) idx += len(s) if context_segs is not None: diff --git a/tests/test_llmlingua.py b/tests/test_llmlingua.py index 3daf3a4..3673525 100644 --- a/tests/test_llmlingua.py +++ b/tests/test_llmlingua.py @@ -11,9 +11,54 @@ class LLMLinguaTester(unittest.TestCase): GSM8K_PROMPT = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. 
How much does 1 apple cost if your total bill was $66?\nLet's think step by step\nIf 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\nIf 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\nIf my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\nAssuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\nIf we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1" GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo and Melanie to plan how many hours they should together their test have 2 their textbook and 4 to They out should and 1 hours. they study, many they study total week they a break every hour, include 3minute and lunch day\n's think step\n Melanie should the chapters hours 2 = hours\n the to dedicate x\n Melanie to with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4" GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "Question: You can buy 4 apples or 1 for. You bought 36 fruits evenly split between, waterons and of 1 orange $.. much does cost if your total bill $\n's think step\nIf were between 3 of, then I 36/3 = 12 of fruitIf 1 orange50 then oranges50 * $If66 I $ oranges I $66 $60 on the other 2 fruit\nAssuming the of is W, and that you price and of is then 1W=4AIf we know we bought 12 and, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1" + JSON_PROMPT = """ + { + "id": "987654", + "name": "John Doe", + "isActive": "true", + "biography": "John Doe, born in New York in 1985, is a renowned software engineer with over 10 years of experience in the field. John graduated from MIT with a degree in Computer Science and has since worked with several Fortune 500 companies. He has a passion for developing innovative software solutions and has contributed to numerous open source projects. John is also an avid writer and speaker at tech conferences, sharing his insights on emerging technologies and their impact on the business world. In his free time, John enjoys hiking, reading science fiction novels, and playing the piano.", + "employmentHistory": [ + { + "company": "TechCorp", + "role": "Senior Software Engineer", + "description": "At TechCorp, John was responsible for leading a team of software engineers and overseeing the development of scalable web applications. 
He played a key role in driving the adoption of cloud technologies within the company, significantly enhancing the efficiency of their digital operations." + }, + { + "company": "Innovatech", + "role": "Lead Developer", + "description": "In his role at Innovatech, John focused on developing cutting-edge AI algorithms and implementing machine learning solutions for various business applications. He was instrumental in developing a predictive analytics tool that transformed the company's approach to data-driven decision making." + } + ], + "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development" + }""" + JSON_COMPRESSED_PROMPT = """ + { + "id": "987654", + "name": "John Doe", + "isActive": "true", + "biography": " Doe, born in York in 1985 a renowned engineer with over the field.John from MIT and has since worked with several.He has a for developing innovative solutions and has to numerous projectsJohn is also avid and speaker at conferences, his on and their the business.In his enjoys, reading fiction and playing piano.", + "employmentHistory": [ + { + "company": "TechCorp", + "role": "Senior", + "description": " John was for leading of engineers and of scalable.He in the of cloud technologies, significantly the of their digital operations."}, + { + "company": "Innovatech", + "role": "Lead", + "description": "In his John on developingedge AI and implementing learning solutions for various was in developing a predictive analytics tool that transformed the company's approach to data-driven decision making."} + ], + "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development" + }""" + MEETINGBANK_TRANSCRIPT_0_PROMPT = "Speaker 4: Thank you. And can we do the functions for content? Items I believe are 11, three, 14, 16 and 28, I believe.\nSpeaker 0: Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group in the City Manager Department by $200 to provide a contribution to the Friends of the Long Beach Public Library. Item 12 is communication from Councilman Super Now. Recommendation to increase appropriation in the special advertising and promotion fund group and the city manager's department by $10,000 to provide support for the end of summer celebration. Item 13 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the city manager department by $500 to provide a donation to the Jazz Angels . Item 14 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.\nSpeaker 4: We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.\nSpeaker 2: Now. 
I had queued up to motion, but.\nSpeaker 4: Great that we have any public comment on this.\nSpeaker 5: If there are any members of the public that would like to speak on items 11, 12, 13, 14, 16 and 28 in person, please sign up at the podium in Zoom. Please use the raise hand feature or dial star nine now. Seen on the concludes public comment.\nSpeaker 4: Thank you. Please to a roll call vote, please.\nSpeaker 0: Councilwoman Sanchez.\nSpeaker 2: I am.\nSpeaker 0: Councilwoman Allen. I. Councilwoman Price.\nSpeaker 2: I.\nSpeaker 0: Councilman Spooner, i. Councilwoman Mongo i. Councilwoman Sarah I. Councilmember Waronker I. Councilman Alston.\nSpeaker 1: I.\nSpeaker 0: Vice Mayor Richardson.\nSpeaker 3: I.\nSpeaker 0: The motion is carried nine zero.\nSpeaker 4: Thank you. That concludes the consent. Just a couple announcements for the regular agenda. So we do have a very long and full agenda today. We have the budget hearing, which will happen first and then right after the budget hearing. We have a variety of other hearings as it relate to the our local control program and sales tax agreement. And then we have we're going to go right into some issues around and bonds around the aquarium and also the second reading of the health care worker ordinance, which we're going to try to do all of that towards the beginning of the agenda. And then we have a long agenda for the rest of of the council. So I just want to warn folks that we do have a we do have a long meeting. We're going to go right into the budget hearings. That's the first thing on the agenda. And they're going to try to move through that, through the council as expeditiously as possible. And so with that, let's continue the budget hearing, which we are doing for fire, police and parks. We're going to hear all of the presentations at once. And then after we go through all the presentations, we'll do all the all of the questions at once and then any any public comment, and we'll go from there." + COMPRESSED_MULTIPLE_STRUCTURED_CONTEXT_PROMPT = '\n {\n "id": "987654",\n "name": "John Doe",\n "isActive": "true",\n "biography": " Doe, born in York in a renowned engineer over the field John from and has worked with several has a for developing has to numerous avid and speaker at, his on and the. In his enjoys, reading fiction and playing piano.",\n "employmentHistory": [\n {\n "company": "TechCorp",\n "role": "Senior",\n "description": " John was for of engineers and of scalable He in the of cloud technologies, significantly the of their digital operations."\n },\n {\n "company": "Innovatech",\n "role": "Lead",\n "description": "In John on developingedge AI and implementing learning for various He was developing a analytics that the\'s to datadriven making."\n }\n ],\n "skills": ",,, AI Development"\n }\n\nSpeaker 4: you. And we do the for content? Items I are 11,,, and, believe.\nSpeaker 0: a communication from on Price to increase the fund the City Manager a the the is Councilman Super. the special provide the of summer.man. increase fund manager a the Jazz theman Allen, provide toa, Sew Feria, of and Item communication. from Mayor Member Mur to Ron Palmer. and Academic\nSpeaker 4: We have a promotion and a second time asman servedman Ringa and customers and they have any comments.\nSpeaker 2: Now. I had queued up to motion, but.\nSpeaker 4: Great that we have any public comment on this.\nSpeaker 5: If there any members the public that to speak on items 11,, 16 and 28 in person please sign up at the podium in Zoom. 
Please use the raise hand feature or dial star nine. Seen on the concludes public comment\nSpeaker 4:. Please to a\nSpeaker 0:woman\nSpeaker 2:\nSpeaker 0:woman.woman\nSpeaker 2:\nSpeaker 0:man,woman Mongowomanmemberman\nSpeaker 1:\nSpeaker 0: Mayor\nSpeaker 3:\nSpeaker 0: The is carried nine\nSpeaker 4: the and the the budget hearings. That\'s the first thing on the agenda. And they\'re going to try to move through that, through the council as expeditiously as possible. And so with that, let\'s continue the budget hearing, which we are doing for fire, police and parks. We\'re going to hear all of the presentations at once. And then after we go through all the presentations, we\'ll do all the all of the questions at once and then any any public comment, and we\'ll go from there.'

     def __init__(self, *args, **kwargs):
         super(LLMLinguaTester, self).__init__(*args, **kwargs)
+        try:
+            import nltk
+            nltk.download('punkt')
+        except Exception:
+            print('nltk_data exists.')
         self.llmlingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu")

     def test_general_compress_prompt(self):
@@ -42,3 +87,43 @@ def test_general_compress_prompt(self):
         self.assertEqual(compressed_prompt["compressed_tokens"], 206)
         self.assertEqual(compressed_prompt["ratio"], "3.5x")
         self.assertEqual(compressed_prompt["rate"], "28.3%")
+
+    def test_general_structured_compress_prompt(self):
+        # Single Structured Context
+        import json
+
+        context, _, _, _ = self.llmlingua.segment_structured_context(
+            [self.JSON_PROMPT], 0.5
+        )
+        _ = json.loads(context[0])
+        compressed_prompt = self.llmlingua.structured_compress_prompt(
+            [self.JSON_PROMPT],
+            rate=0.5,
+            use_sentence_level_filter=True,
+            use_token_level_filter=True,
+        )
+        _ = json.loads(compressed_prompt["compressed_prompt"])
+        self.assertEqual(
+            compressed_prompt["compressed_prompt"],
+            self.JSON_COMPRESSED_PROMPT,
+        )
+        self.assertEqual(compressed_prompt["origin_tokens"], 318)
+        self.assertEqual(compressed_prompt["compressed_tokens"], 225)
+        self.assertEqual(compressed_prompt["ratio"], "1.4x")
+        self.assertEqual(compressed_prompt["rate"], "70.8%")
+
+        # Multiple Structured Context
+        compressed_prompt = self.llmlingua.structured_compress_prompt(
+            [self.JSON_PROMPT, self.MEETINGBANK_TRANSCRIPT_0_PROMPT],
+            rate=0.5,
+            use_sentence_level_filter=False,
+            use_token_level_filter=True,
+        )
+        self.assertEqual(
+            compressed_prompt["compressed_prompt"],
+            self.COMPRESSED_MULTIPLE_STRUCTURED_CONTEXT_PROMPT,
+        )
+        self.assertEqual(compressed_prompt["origin_tokens"], 1130)
+        self.assertEqual(compressed_prompt["compressed_tokens"], 567)
+        self.assertEqual(compressed_prompt["ratio"], "2.0x")
+        self.assertEqual(compressed_prompt["rate"], "50.2%")
diff --git a/tests/test_longllmlingua.py b/tests/test_longllmlingua.py
index 6dccd5f..27b6005 100644
--- a/tests/test_longllmlingua.py
+++ b/tests/test_longllmlingua.py
@@ -12,9 +12,57 @@ class LongLLMLinguaTester(unittest.TestCase):
     GSM8K_250TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo Melanie want plan many over the together for their next week chapters of to study worksheets memorize They out they should hours each chapter 1.
hours each worksheet plan study no each day, many plan to total take a 10minute break every hour 10- snack breaks each day, 30 minutes each day?Let think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of 2 chapters, x chapters hours total.For worksheets plan to 1.5 for works,.5 hours 4 worksheets hours total and Melanie need with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?" GSM8K_250TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "Question: You or 1 watermelon the price You bought 36 evenly split between oranges, watermelons, and price 1 orange is $0.50 How apple if total bill was $?Let's think step by stepIf were evenly split 3 types of fruits, 36/ = 12 units of fruit\nIf 1 orange $0. then will cost $0.50 * 126\nIf my total bill was $ and I spent $6 on oranges then I spent $66 - $6 = $ the other 2 fruit types.\nAssuming price is W knowing you 4 apples the same price that price one is A, 1=4A\n we we bought watermelons and apples for $60 we know that $ 12W + 12A\nKnowing that 1W=4A, we can convert the above to $ 12(4A) + 12A$60 = 48A + 12A\n$60 = 60\nThen know price of one apple (A) is $60/60= $\nThe answer is 1\n\nQuestiony while. After three weeks, how many social media followers did the girl with the most total followers have?\nLet's think step by step\nAfter one week, Susy has 100+40 = 140 followers.\nIn the second week, Susy gains 40/2 = 20 new followers.\nIn the third week, Susy gains 20/2 = 10 new followers.\nIn total, Susy finishes the three weeks with 140+20+10 = 170 total followers.\nAfter one week, Sarah has 50+90 = 140 followers.\nAfter the second week, Sarah gains 90/3 = 30 followers.\nAfter the third week, Sarah gains 30/3 = 10 followers.\nSo, Sarah finishes the three weeks with 140+30+10 = 180 total followers.\nThus, Sarah is the girl with the most total followers with a total of 180.\nThe answer is 180\n\nQuestion: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?" QUESTION = "Question: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?" + JSON_PROMPT = """ + { + "id": "987654", + "name": "John Doe", + "isActive": "true", + "biography": "John Doe, born in New York in 1985, is a renowned software engineer with over 10 years of experience in the field. John graduated from MIT with a degree in Computer Science and has since worked with several Fortune 500 companies. 
He has a passion for developing innovative software solutions and has contributed to numerous open source projects. John is also an avid writer and speaker at tech conferences, sharing his insights on emerging technologies and their impact on the business world. In his free time, John enjoys hiking, reading science fiction novels, and playing the piano.", + "employmentHistory": [ + { + "company": "TechCorp", + "role": "Senior Software Engineer", + "description": "At TechCorp, John was responsible for leading a team of software engineers and overseeing the development of scalable web applications. He played a key role in driving the adoption of cloud technologies within the company, significantly enhancing the efficiency of their digital operations." + }, + { + "company": "Innovatech", + "role": "Lead Developer", + "description": "In his role at Innovatech, John focused on developing cutting-edge AI algorithms and implementing machine learning solutions for various business applications. He was instrumental in developing a predictive analytics tool that transformed the company's approach to data-driven decision making." + } + ], + "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development" + }""" + JSON_COMPRESSED_PROMPT = """ + { + "id": "765", + "name": " Doe", + "isActive": "true", + "biography": "John York is a software years experience in the. MIT with degree Science and has worked with several Fortune companies He developing solutions and projects is and speaker tech conferences emerging business world. In his time, enjoys, reading, playing.", + "employmentHistory": [ + { + "company": "TechCorp", + "role": "", + "description": " Tech, was leading software engineers overseeing web played key the cloud technologies within the company, significantly enhancing their digital." + }, + { + "company": "Innovatech", + "role": "", + "description": " role developingedge AI implementing machine learning solutions for various business applications. He was instrumental in developing a predictive analytics tool that transformed the company's approach to data-driven decision making." + } + ], + "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development" + }""" + MEETINGBANK_TRANSCRIPT_0_PROMPT = "Speaker 4: Thank you. And can we do the functions for content? Items I believe are 11, three, 14, 16 and 28, I believe.\nSpeaker 0: Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group in the City Manager Department by $200 to provide a contribution to the Friends of the Long Beach Public Library. Item 12 is communication from Councilman Super Now. Recommendation to increase appropriation in the special advertising and promotion fund group and the city manager's department by $10,000 to provide support for the end of summer celebration. Item 13 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the city manager department by $500 to provide a donation to the Jazz Angels . Item 14 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. 
Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.\nSpeaker 4: We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.\nSpeaker 2: Now. I had queued up to motion, but.\nSpeaker 4: Great that we have any public comment on this.\nSpeaker 5: If there are any members of the public that would like to speak on items 11, 12, 13, 14, 16 and 28 in person, please sign up at the podium in Zoom. Please use the raise hand feature or dial star nine now. Seen on the concludes public comment.\nSpeaker 4: Thank you. Please to a roll call vote, please.\nSpeaker 0: Councilwoman Sanchez.\nSpeaker 2: I am.\nSpeaker 0: Councilwoman Allen. I. Councilwoman Price.\nSpeaker 2: I.\nSpeaker 0: Councilman Spooner, i. Councilwoman Mongo i. Councilwoman Sarah I. Councilmember Waronker I. Councilman Alston.\nSpeaker 1: I.\nSpeaker 0: Vice Mayor Richardson.\nSpeaker 3: I.\nSpeaker 0: The motion is carried nine zero.\nSpeaker 4: Thank you. That concludes the consent. Just a couple announcements for the regular agenda. So we do have a very long and full agenda today. We have the budget hearing, which will happen first and then right after the budget hearing. We have a variety of other hearings as it relate to the our local control program and sales tax agreement. And then we have we're going to go right into some issues around and bonds around the aquarium and also the second reading of the health care worker ordinance, which we're going to try to do all of that towards the beginning of the agenda. And then we have a long agenda for the rest of of the council. So I just want to warn folks that we do have a we do have a long meeting. We're going to go right into the budget hearings. That's the first thing on the agenda. And they're going to try to move through that, through the council as expeditiously as possible. And so with that, let's continue the budget hearing, which we are doing for fire, police and parks. We're going to hear all of the presentations at once. And then after we go through all the presentations, we'll do all the all of the questions at once and then any any public comment, and we'll go from there." + COMPRESSED_MULTIPLE_STRUCTURED_CONTEXT_PROMPT = '\n {\n "id": "765",\n "name": " Doe",\n "isActive": "true",\n "biography": "John York is a software years experience in the. MIT with degree Science and has worked with several Fortune companies He developing solutions and projects is and speaker tech conferences emerging business world. In his time, enjoys, reading, playing.",\n "employmentHistory": [\n {\n "company": "TechCorp",\n "role": "",\n "description": " Tech, was leading software engineers overseeing web played the cloud technologies within the company, significantly enhancing their digital."\n },\n {\n "company": "Innovatech",\n "role": "",\n "description": " role developingedge AI implementing for was developing a analytics that transformed thedriven decision."\n }\n ],\n "skills": "Java,,, Development"\n }\n\nSpeaker 4: Thank you. 
And do the functions for content? I believe 11, 14, 16 and 28 I believe.\nSpeaker 0: 11 a recommendation in the provide the is Superation for. 13 aation increase manager the $man to Cas Kore, Beachen. Item communication from Mayor Council.\nSpeaker 4: We have and councilman Ringa and customers and they have any comments.\nSpeaker 2:. to motion,\nSpeaker 4: Great we have any public comment on this\nSpeaker 5: If there of the that like speak on 11 12 and 28 person, sign up at the podium Zoom Please use hand or star Seen on the comment.\nSpeaker 4: Please to please\nSpeaker 0: Councilwoman Sanchez.\nSpeaker 2: I am\nSpeaker 0:woman.woman.\nSpeaker 2: I\nSpeaker 0: Councilman i Mongo Councilston\nSpeaker 1: I.\nSpeaker 0: Vice Mayor\nSpeaker 3: I\nSpeaker 0: zero\nSpeaker 4: the budget hearings. That\'s the first thing on the agenda. And they\'re going to try to move through that, through the council as expeditiously as possible. And so with that, let\'s continue the budget hearing, which we are doing for fire, police and parks. We\'re going to hear all of the presentations at once. And then after we go through all the presentations, we\'ll do all the all of the questions at once and then any any public comment, and we\'ll go from there.'
+    STRUCTURED_QUESTION = "Question: What is the main idea of these materials?"

     def __init__(self, *args, **kwargs):
         super(LongLLMLinguaTester, self).__init__(*args, **kwargs)
+        try:
+            import nltk
+            nltk.download('punkt')
+        except Exception:
+            print('nltk_data exists.')
         self.llmlingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu")

     def test_general_compress_prompt(self):
@@ -59,3 +107,57 @@ def test_general_compress_prompt(self):
         self.assertEqual(compressed_prompt["compressed_tokens"], 474)
         self.assertEqual(compressed_prompt["ratio"], "2.3x")
         self.assertEqual(compressed_prompt["rate"], "43.3%")
+
+    def test_general_structured_compress_prompt(self):
+        # Single Structured Context
+        import json
+
+        context, _, _, _ = self.llmlingua.segment_structured_context(
+            [self.JSON_PROMPT], 0.5
+        )
+        _ = json.loads(context[0])
+        compressed_prompt = self.llmlingua.structured_compress_prompt(
+            [self.JSON_PROMPT],
+            question=self.STRUCTURED_QUESTION,
+            target_token=150,
+            use_sentence_level_filter=False,
+            condition_in_question="after_condition",
+            reorder_context="sort",
+            dynamic_context_compression_ratio=0.4,
+            condition_compare=True,
+            context_budget="+100",
+            rank_method="longllmlingua",
+            concate_question=False,
+        )
+        _ = json.loads(compressed_prompt["compressed_prompt"])
+        self.assertEqual(
+            compressed_prompt["compressed_prompt"],
+            self.JSON_COMPRESSED_PROMPT,
+        )
+        self.assertEqual(compressed_prompt["origin_tokens"], 329)
+        self.assertEqual(compressed_prompt["compressed_tokens"], 205)
+        self.assertEqual(compressed_prompt["ratio"], "1.6x")
+        self.assertEqual(compressed_prompt["rate"], "62.3%")
+
+        # Multiple Structured Context
+        compressed_prompt = self.llmlingua.structured_compress_prompt(
+            [self.JSON_PROMPT, self.MEETINGBANK_TRANSCRIPT_0_PROMPT],
+            question=self.STRUCTURED_QUESTION,
+            target_token=650,
+            use_sentence_level_filter=False,
+            condition_in_question="after_condition",
+            reorder_context="sort",
+            dynamic_context_compression_ratio=0.4,
+            condition_compare=True,
+            context_budget="+100",
+            rank_method="longllmlingua",
+            concate_question=False,
+        )
+        self.assertEqual(
+            compressed_prompt["compressed_prompt"],
+            self.COMPRESSED_MULTIPLE_STRUCTURED_CONTEXT_PROMPT,
+        )
+        self.assertEqual(compressed_prompt["origin_tokens"], 1141)
+        self.assertEqual(compressed_prompt["compressed_tokens"], 504)
+        self.assertEqual(compressed_prompt["ratio"], "2.3x")
+        self.assertEqual(compressed_prompt["rate"], "44.2%")
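
The new test_general_structured_compress_prompt cases above double as a usage reference for the structured API added in this series. For readers who want to exercise it outside the test harness, a minimal sketch follows; it reuses the same small CPU model and keyword arguments as the tests, but the context string here is a hypothetical placeholder rather than one of the fixtures:

    from llmlingua import PromptCompressor

    # Same lightweight model the unit tests use, so this runs on CPU.
    llmlingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu")

    result = llmlingua.structured_compress_prompt(
        ['{"id": "987654", "name": "John Doe"}'],  # placeholder structured context
        rate=0.5,
        use_sentence_level_filter=False,
        use_token_level_filter=True,
    )
    # The returned dict carries the same keys the tests assert on.
    print(result["origin_tokens"], result["compressed_tokens"], result["ratio"], result["rate"])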
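One subtlety in PATCH 19 worth calling out is the new get_prefix_length helper: BPE tokenizers can merge characters across the seam between a prefix and the text that follows it, so len(tokenizer(prefix)) is not a reliable count of the tokens the prefix occupies inside the concatenation. The helper therefore backs up a few tokens and scans forward until the decoded span equals the prefix exactly. A minimal sketch of the effect it guards against (not part of the patch; it assumes any Hugging Face BPE tokenizer, with "gpt2" used purely for illustration):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    prefix = "Question: What is the main idea of these materials?\n\n"
    text = "Speaker 4: Thank you."

    solo_len = len(tokenizer(prefix, add_special_tokens=False).input_ids)
    joint_ids = tokenizer(prefix + text, add_special_tokens=False).input_ids

    # The first solo_len tokens of the joint encoding may not decode back to
    # the prefix, because merges can straddle the boundary; get_prefix_length
    # scans for the exact cut point instead of trusting solo_len.
    print(tokenizer.decode(joint_ids[:solo_len]) == prefix)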