WING-NUS · dyxohjl666 · Mar 8, 2023 · Mar 8, 2023 · Mar 8, 2023 · Mar 8, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "SciAssist"
-version = "0.0.37"
+version = "0.1.4"
 authors = [
   { name="WING-NUS", email="[email protected]" },
 ]
@@ -23,30 +23,31 @@ classifiers = [
 dependencies = [
     "beautifulsoup4~=4.9.0",
     "chardet~=3.0.4",
-    "datasets~=2.2.2",
+    "datasets~=2.15.0",
     "hydra-core>=1.1.0",
     "lxml",
     "matplotlib~=3.5.1",
     "nltk~=3.7",
-    "numpy~=1.19.2",
+    "numpy",
     "omegaconf~=2.2.2",
     "PyPDF2~=2.10.7",
     "python_magic~=0.4.18",
-    "pytorch_lightning~=1.7.1",
-    "requests~=2.21.0",
+    "pytorch_lightning~=2.0.4",
+    "requests~=2.22.0",
     "rich~=12.4.4",
     "seaborn~=0.11.2",
     "setuptools>=61.0",
     "torch>=1.12.0",
-    "torchmetrics>=0.7.0",
-    "transformers~=4.19.2",
+    "torchmetrics==0.11.4",
+    "transformers~=4.30.2",
     "wandb~=0.12.19",
     "pdfminer.six",
     "pandas~=1.4.3",
     "pytorch-crf",
     "torchcrf",
     "sacremoses",
     "seqeval",
+    "pytest~=7.4.3"
 ]
 
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 # --------- pytorch --------- #
 torch>=1.10.0
 torchvision>=0.11.0
-pytorch-lightning~=1.7.1
-torchmetrics>=0.7.0
+pytorch-lightning~=2.0.4
+torchmetrics==0.11.4
 
 # --------- hydra --------- #
@@ -28,15 +28,15 @@ pyrootutils
 python-dotenv~=0.20.0
 protobuf~=3.19.0
 rich~=12.4.4
-pytest~=7.1.2
+pytest~=7.4.3
 # sh~=1.14.2
 pudb            # debugger
 
 seaborn~=0.11.2
 omegaconf~=2.2.2
-transformers~=4.19.2
+transformers~=4.30.2
 packaging~=21.3
-datasets~=2.2.2
+datasets~=2.15.0
 beautifulsoup4~=4.9.0
 
 matplotlib~=3.5.1
@@ -46,7 +46,7 @@ pdfminer.six # windows pdf processing
 
 # --------- doc2json --------- #
 boto3~=1.9.147
-requests~=2.21.0
+requests~=2.22.0
 Flask~=1.0.2
 tqdm
 lxml

diff --git a/src/SciAssist/models/components/flant5_summarization.py b/src/SciAssist/models/components/flant5_summarization.py
@@ -28,15 +28,22 @@ def forward(self, input_ids=None, attention_mask=None, labels=None):
             logits=outputs.logits
         )
 
-    def generate(self, input_ids=None, attention_mask=None, num_beams=5, num_return_sequences=1):
+    def generate(self, input_ids=None, attention_mask=None, num_beams=1, num_return_sequences=1, top_k=0, max_length=500, do_sample=False):
+        # Diverse (group) beam search is only enabled when several sequences
+        # are requested without sampling: transformers rejects a positive
+        # `diversity_penalty` unless `num_beam_groups` > 1, and rejects group
+        # beam search together with `do_sample=True`. In that mode `num_beams`
+        # must be a multiple of `num_return_sequences`.
+        num_beam_groups = 1
         diversity_penalty = 0.0
-        if num_return_sequences>1:
+        if num_return_sequences > 1 and not do_sample:
+            num_beam_groups = num_return_sequences
             diversity_penalty = 1.0
         return self.flant5.generate(input_ids=input_ids, attention_mask=attention_mask,
                                     num_beams=num_beams,
                                     num_return_sequences=num_return_sequences,
-                                    num_beam_groups=num_return_sequences,
+                                    num_beam_groups=num_beam_groups,
                                     diversity_penalty=diversity_penalty,
-                                    max_length=300,
-                                    do_sample=False,
-                                    no_repeat_ngram_size=5 )
+                                    top_k=top_k,
+                                    max_length=max_length,
+                                    do_sample=do_sample)
diff --git a/src/SciAssist/models/cora_module.py b/src/SciAssist/models/cora_module.py
@@ -11,6 +11,7 @@
 from torchmetrics import MaxMetric
 from torchmetrics.classification.accuracy import Accuracy
 
+
 from SciAssist.datamodules.components.cora_label import num_labels, LABEL_NAMES
 from SciAssist.models.components.bert_token_classifier import BertForTokenClassifier
 from SciAssist.utils.data_utils import DataUtilsForTokenClassification
@@ -67,7 +68,7 @@ def training_step(self, batch: Any, batch_idx: int):
         self.log("train/loss", loss, on_step=False, on_epoch=True, prog_bar=False)
         return {"loss": loss}
 
-    def training_epoch_end(self, outputs: List[Any]):
+    def on_train_epoch_end(self):
         pass
 
     def validation_step(self, batch: Any, batch_idx: int):
@@ -91,7 +92,7 @@ def validation_step(self, batch: Any, batch_idx: int):
         self.log("val/macro_f1", macro_f1, on_step=False, on_epoch=True, prog_bar=True)
         return {"loss": loss, "preds": true_preds, "labels": true_labels}
 
-    def validation_epoch_end(self, outputs: List[Any]):
+    def on_validation_epoch_end(self):
         acc = self.val_acc.compute()
         self.val_acc_best.update(acc)
         self.log("val/acc_best", self.val_acc_best.compute(), on_epoch=True, prog_bar=True)
@@ -129,7 +130,7 @@ def test_step(self, batch: Any, batch_idx: int):
 
         return {"loss": loss, "preds": true_preds, "labels": true_labels}
 
-    def test_epoch_end(self, outputs: List[Any]):
+    def on_test_epoch_end(self):
         # wandb.init()
         acc = self.test_acc.compute()
         micro_f1 = self.test_micro_f1.compute()

diff --git a/src/SciAssist/models/mup_bart_module.py b/src/SciAssist/models/mup_bart_module.py
@@ -73,7 +73,7 @@ def training_step(self, batch: Any, batch_idx: int):
         self.log("train/loss", loss, on_step=True, on_epoch=True, prog_bar=False)
         return {"loss": loss}
 
-    def training_epoch_end(self, outputs: List[Any]):
+    def on_train_epoch_end(self):
         pass
 
     def validation_step(self, batch: Any, batch_idx: int):
@@ -121,7 +121,7 @@ def validation_step(self, batch: Any, batch_idx: int):
 
         return result
 
-    def validation_epoch_end(self, outputs: List[Any]):
+    def on_validation_epoch_end(self):
         rouge = self.val_metric.compute()
         # bert = self.val_bertscore.compute()
         # self.val_best_Rouge1.update(rouge["rouge1_fmeasure"])
@@ -219,22 +219,14 @@ def test_step(self, batch: Any, batch_idx: int):
         return result
 
 
-    def test_epoch_end(self, outputs: List[Any]):
+    def on_test_epoch_end(self):
         # Save prediction results
         # with open(os.path.join(self.model.model_dir,"prediction.txt"),'w') as f:
         #     for batch in outputs:
         #         for res in batch["preds"]:
         #             f.write(res)
         #             f.write("\n")
-
-        for batch in outputs:
-            for id,res in zip(batch['id'],batch["preds"]):
-                with open("/home/dingyx/project/SciAssist/data/pdfs/summary_flant5/"  + str(id.item()) +".txt","a") as f:
-                    # print("/home/dingyx/project/SciAssist/data/MUP_CTRLkeyword/" + str(id.item()) +".txt")
-                    f.write(res)
-                    f.write("\n")
-                    # f.write(str(len(res.split(" "))))
-
+
         P,R,F1 = bert_score.score(self.test_preds, self.test_labels,
                                             rescale_with_baseline=True, lang="en")
         # Compute average length of summaries
@@ -260,7 +252,6 @@ def test_epoch_end(self, outputs: List[Any]):
         self.log("test/gen_len", self.test_gen_len, on_step=False, on_epoch=True, prog_bar=True)
 
 
-
     def on_epoch_end(self):
         self.val_metric.reset()
         # self.val_bertscore.reset()

diff --git a/src/SciAssist/pipelines/__init__.py b/src/SciAssist/pipelines/__init__.py
@@ -1,8 +1,7 @@
 # main developer: Yixi Ding <[email protected]>
 
-from typing import Dict
-
 import torch
+from typing import Dict
 
 from SciAssist import BASE_CACHE_DIR
 from SciAssist.models.components.bert_dataset_extraction import BertForDatasetExtraction
@@ -47,7 +46,12 @@
             "model": FlanT5ForSummarization,
             "model_dict_url": "https://huggingface.co/spaces/dyxohjl666/Controlled-summarization/resolve/main/flant5-base-mup-scisumm-repeat5-kws.pt",
             "data_utils": DataUtilsForFlanT5,
-        }
+        },
+        "flan-t5-xl": {
+            "model": FlanT5ForSummarization,
+            "model_dict_url": None,
+            "data_utils": DataUtilsForFlanT5,
+        },
     },
     "dataset-extraction": {
         "default": {
@@ -59,7 +63,7 @@
 
 }
 
-def load_model(config: Dict, cache_dir=BASE_CACHE_DIR, device="gpu"):
+def load_model(config: Dict, checkpoint=None, cache_dir=BASE_CACHE_DIR, device="gpu"):
     '''
 
     Args:
@@ -77,7 +81,12 @@ def load_model(config: Dict, cache_dir=BASE_CACHE_DIR, device="gpu"):
 
     print("Loading the model...")
     model_class = config["model"]
-    model = model_class(cache_dir=cache_dir)
+
+    if checkpoint is not None:
+        model = model_class(cache_dir=cache_dir, model_checkpoint=checkpoint)
+    else:
+        model = model_class(cache_dir=cache_dir)
+
     map_location=None
     if device == "cpu":
         map_location = torch.device("cpu")

diff --git a/src/SciAssist/pipelines/pipeline.py b/src/SciAssist/pipelines/pipeline.py
@@ -27,7 +27,7 @@ class Pipeline():
 
     """
 
-    def __init__(self, task_name: str, model_name: str = "default", device="gpu",
+    def __init__(self, task_name: str, model_name: str = "default", checkpoint: str = None, device="gpu",
                  cache_dir=None, output_dir=None, temp_dir=None):
 
         self.device = device
@@ -37,7 +37,7 @@ def __init__(self, task_name: str, model_name: str = "default", device="gpu",
 
         self.config = TASKS[task_name][model_name]
         self.model_name = model_name
-        self.model = load_model(config=self.config, cache_dir=self.cache_dir, device=self.device)
+        self.model = load_model(config=self.config, checkpoint=checkpoint,cache_dir=self.cache_dir, device=self.device)
         if device in ["cuda", "gpu"] and torch.cuda.is_available():
             self.device = torch.device("cuda")
             self.model.cuda()