diff --git a/DOCUMENT.md b/DOCUMENT.md index fb0431b..1ff0b7b 100644 --- a/DOCUMENT.md +++ b/DOCUMENT.md @@ -31,7 +31,7 @@ llm_lingua = PromptCompressor("microsoft/phi-2") llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"}) ``` -To try **LongLLMLingua** in your scenorias, you can use +To try **LongLLMLingua** in your scenarios, you can use ```python from llmlingua import PromptCompressor @@ -51,7 +51,7 @@ compressed_prompt = llm_lingua.compress_prompt( ) ``` -To try **LLMLingua-2** in your scenorias, you can use +To try **LLMLingua-2** in your scenarios, you can use ```python from llmlingua import PromptCompressor diff --git a/README.md b/README.md index 9ac0b61..824310e 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ llm_lingua = PromptCompressor("microsoft/phi-2") llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"}) ``` -To try **LongLLMLingua** in your scenorias, you can use +To try **LongLLMLingua** in your scenarios, you can use ```python from llmlingua import PromptCompressor @@ -168,7 +168,7 @@ compressed_prompt = llm_lingua.compress_prompt( ) ``` -To try **LLMLingua-2** in your scenorias, you can use +To try **LLMLingua-2** in your scenarios, you can use ```python from llmlingua import PromptCompressor diff --git a/experiments/llmlingua2/README.md b/experiments/llmlingua2/README.md index 8ba4f62..96c966a 100644 --- a/experiments/llmlingua2/README.md +++ b/experiments/llmlingua2/README.md @@ -24,7 +24,7 @@ pip install tensorboard ## Data collection -We will release our collected GPT-4 compression result at [HF](https://huggingface.co/datasets/microsoft/LLMLingua-2-data-MeetingBankComp) after review. We also provide the whole data collection pipeline at [**collect_data.sh**](data_collection/collect_data.sh) to help you construct your custom compression dataset. 
+We release our collected GPT-4 compression results at [HF](https://huggingface.co/datasets/microsoft/MeetingBank-LLMCompressed). We also provide the whole data collection pipeline at [**collect_data.sh**](data_collection/collect_data.sh) to help you construct your custom compression dataset. ## Model Training diff --git a/experiments/llmlingua2/data_collection/README.md index 8083e11..8759740 100644 --- a/experiments/llmlingua2/data_collection/README.md +++ b/experiments/llmlingua2/data_collection/README.md @@ -1,10 +1,10 @@ ### Use our collected data -We will release our collected GPT-4 compression result at [HF](https://huggingface.co/datasets/microsoft/LLMLingua-2-data-MeetingBankComp) after review. To load data, simply use +We release our collected GPT-4 compression results at [HF](https://huggingface.co/datasets/microsoft/MeetingBank-LLMCompressed). To load data, simply use ```python from datasets import load_dataset -data = load_dataset("microsoft/LLMLingua-2-data-MeetingBankComp", split="train") +data = load_dataset("microsoft/MeetingBank-LLMCompressed", split="train") print(len(data)) for idx, sample in enumerate(data): # concatenation of all chunks @@ -16,7 +16,7 @@ for idx, sample in enumerate(data): To load compressed chunks along with original chunks, simply use ```python from datasets import load_dataset -data = load_dataset("microsoft/LLMLingua-2-data-MeetingBankComp", split="train") +data = load_dataset("microsoft/MeetingBank-LLMCompressed", split="train") print(len(data)) for idx, sample in enumerate(data): # chunk list diff --git a/experiments/llmlingua2/data_collection/collect_data.sh index acd58f8..21963bf 100644 --- a/experiments/llmlingua2/data_collection/collect_data.sh +++ b/experiments/llmlingua2/data_collection/collect_data.sh @@ -1,16 +1,16 @@ # Copyright (c) 2023 Microsoft # Licensed under The MIT License 
[see LICENSE for details] -python format_data.py +# python format_data.py -python compress.py --load_origin_from ../../../results/meetingbank/origin/meetingbank_train_formated.json \ - --compressor gpt4 \ - --chunk_size 512 \ - --save_path ../../../results/meetingbank/gpt-4-32k_comp/compression_cs512_meetingbank_train_formated.json +# python compress.py --load_origin_from ../results/meetingbank/origin/meetingbank_train_formated.json \ +# --compressor gpt4 \ +# --chunk_size 512 \ +# --save_path ../results/meetingbank/gpt-4-32k_comp/compression_cs512_meetingbank_train_formated.json -python label_word.py --load_prompt_from ../../../results/meetingbank/gpt-4-32k_comp/compression_cs512_meetingbank_train_formated.json \ +python label_word.py --load_prompt_from microsoft/MeetingBank-LLMCompressed \ --window_size 400 \ - --save_path ../../../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.json + --save_path ../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.json -python filter.py --load_path ../../../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.pt \ - --save_path ../../../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt +python filter.py --load_path ../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.pt \ + --save_path ../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt diff --git a/experiments/llmlingua2/data_collection/label_word.py b/experiments/llmlingua2/data_collection/label_word.py index 5593b78..25e82a6 100644 --- a/experiments/llmlingua2/data_collection/label_word.py +++ b/experiments/llmlingua2/data_collection/label_word.py @@ -6,7 +6,7 @@ import logging import os from collections import defaultdict - +from datasets import load_dataset import spacy import torch from tqdm import tqdm @@ -58,16 +58,15 @@ def split_string(input_string, ignore_tokens=set([","])): def 
is_equal(token1, token2): return token1.lower() == token2.lower() - origins, comps = [], [] -raw_data = json.load(open(args.load_prompt_from, "r")) -for sid, sample in raw_data.items(): +meeting_bank_comp = load_dataset(args.load_prompt_from, split="train") +for i, sample in enumerate(meeting_bank_comp): if len(sample["prompt_list"]) != len(sample["compressed_prompt_list"]): - print(f"{sid}-th length not equal") + print(f"{i}-th length not equal") continue origins.extend(sample["prompt_list"]) comps.extend(sample["compressed_prompt_list"]) - + res = {} res_pt = defaultdict(list) diff --git a/experiments/llmlingua2/model_training/train.sh b/experiments/llmlingua2/model_training/train.sh index e1ed311..1a3f615 100644 --- a/experiments/llmlingua2/model_training/train.sh +++ b/experiments/llmlingua2/model_training/train.sh @@ -1,5 +1,5 @@ # Copyright (c) 2023 Microsoft # Licensed under The MIT License [see LICENSE for details] -python train_roberta.py --data_path ../../../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.pt \ +python train_roberta.py --data_path ../../../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt \ --save_path ../../../results/models/xlm_roberta_large_meetingbank_only.pth