From 089f64753833a7ac1bafaa33da47a6a96a52cfdc Mon Sep 17 00:00:00 2001
From: Ligeng Zhu
Date: Mon, 26 Aug 2024 23:21:25 +0800
Subject: [PATCH] Support nvscope (#132)

Co-authored-by: Zhijian Liu <5782437+zhijian-liu@users.noreply.github.com>
---
 CIs/nvscope.py                              | 92 +++++++++++++++++++++
 llava/data/dataset_impl/coyo_recap.py       |  3 +
 llava/data/dataset_impl/general_img_text.py |  2 +
 llava/data/dataset_impl/hiertext.py         |  2 +
 llava/data/dataset_impl/panda70m.py         |  2 +
 llava/data/dataset_impl/sam.py              |  2 +
 llava/data/datasets_mixture.py              |  4 +-
 llava/data/simple_video_dataset.py          | 21 +++--
 llava/data/simple_vila_webdataset.py        |  2 +-
 llava/data_aug/srun_reformat.sh             |  2 +-
 llava/data_aug/video_inference.py           |  2 +-
 pyproject.toml                              |  2 +-
 12 files changed, 122 insertions(+), 14 deletions(-)
 create mode 100644 CIs/nvscope.py

diff --git a/CIs/nvscope.py b/CIs/nvscope.py
new file mode 100644
index 00000000..4d6ea08f
--- /dev/null
+++ b/CIs/nvscope.py
@@ -0,0 +1,92 @@
+import os
+import os.path as osp
+import re
+import subprocess
+import sys
+
+r_on = re.compile(r"""[\t\s]*#[\t\s]*nvcode[\t\s]*:[\t\s]*on[\t\s]*""")
+r_off = re.compile(r"""[\t\s]*#[\t\s]*nvcode[\t\s]*:[\t\s]*off[\t\s]*""")
+
+"""
+Strip all lines between an `# nvcode: on` marker and the next `# nvcode: off`.
+"""
+
+
+def filter_nvcode(file_path):
+    with open(file_path) as f:
+        lines = f.readlines()
+
+    new_lines = []
+    new_idx = []
+    skip_flag = False
+    for i, line in enumerate(lines):
+        if r_off.findall(line):
+            skip_flag = False
+            continue
+
+        if skip_flag:
+            continue
+
+        if r_on.findall(line):
+            skip_flag = True
+            continue
+        new_lines.append(line)
+        new_idx.append(i)
+
+    return lines, new_idx, new_lines
+
+
+def iterate_py_files(directory):
+    py_files = []
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith((".py", ".sh")):
+                py_files.append(os.path.join(root, file))
+    return py_files
+
+
+def get_authors(file_path):
+    cmd = f"git log --format='%aN' '{file_path}' | sort -u"
+    output = subprocess.check_output(cmd, shell=True, stderr=None).decode("utf-8")
+    authors = output.strip().split("\n")
+    return authors
+
+
+def check_rule(line):
+    if "/lustre/fsw" in line or "/home" in line:
+        return True
+
+    if "nvr_elm_llm" in line or "llmservice" in line or "cosmos_misc" in line:
+        return True
+
+    return False
+
+
+def check_file_confidential_info(fpath):
+    lines, new_idx, new_lines = filter_nvcode(fpath)
+
+    pass_flag = True
+    for idx, (_idx, line) in enumerate(zip(new_idx, new_lines)):
+        if "/lustre/fsw" in line or "/home" in line:
+            authors = get_authors(fpath)
+            print(f"{fpath} --- Line: {_idx} --- authors: {authors}")
+            print("\t", line.strip())
+            pass_flag = False
+
+    return pass_flag
+
+
+def main(fpath="llava/data/datasets_mixture.py"):
+    if osp.isdir(fpath):
+        py_files = iterate_py_files(fpath)
+        for fpath in py_files:
+            res = check_file_confidential_info(fpath)
+    elif osp.isfile(fpath):
+        res = check_file_confidential_info(fpath)
+
+
+if __name__ == "__main__":
+    import fire
+
+    fire.Fire(main)
+    # print("".join(res))
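The core of the checker is `filter_nvcode`: every line between an `# nvcode: on` marker and the next `# nvcode: off` is dropped, including the marker lines themselves, and only the surviving lines are scanned for confidential paths. A minimal sketch of that behavior, assuming the script is invoked from the repository root so `CIs.nvscope` is importable; the sample file content is made up for illustration:

import tempfile

from CIs.nvscope import filter_nvcode

# A throwaway file whose middle section is fenced off by nvcode markers.
sample = (
    "keep_this = 1\n"
    "# nvcode: on\n"
    "secret = '/lustre/fsw/internal'\n"
    "# nvcode: off\n"
    "keep_this_too = 2\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
    f.write(sample)

lines, kept_idx, kept_lines = filter_nvcode(f.name)
# Only the lines outside the markers survive, with their original indices.
assert kept_lines == ["keep_this = 1\n", "keep_this_too = 2\n"]
assert kept_idx == [0, 4]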
diff --git a/llava/data/dataset_impl/coyo_recap.py b/llava/data/dataset_impl/coyo_recap.py
index ca1d58c1..0fb70fe6 100755
--- a/llava/data/dataset_impl/coyo_recap.py
+++ b/llava/data/dataset_impl/coyo_recap.py
@@ -88,7 +88,10 @@ def __init__(
             n_samples_per_idx=n_samples_per_idx,
         )
         if getattr(data_args, "caption_choice", None) is None:
+            self.caption_choice = "~/workspace/coyo-25m-recap"
+            # nvcode: on
             self.caption_choice = "/home/ligengz/workspace/coyo-25m-recap"
+            # nvcode: off
         else:
             self.caption_choice = data_args.caption_choice
         print(f"Current caption choice: {self.caption_choice}.")
diff --git a/llava/data/dataset_impl/general_img_text.py b/llava/data/dataset_impl/general_img_text.py
index 278eca1b..3571b85f 100644
--- a/llava/data/dataset_impl/general_img_text.py
+++ b/llava/data/dataset_impl/general_img_text.py
@@ -215,7 +215,9 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:


 if __name__ == "__main__":
+    # nvcode: on
     data_path = "/home/ligengz/nvr_elm_llm/dataset/nv-clip-5m/data"
+    # nvcode: off
     dst = VILAWebDataset(
         data_path=osp.abspath(data_path),
     )
diff --git a/llava/data/dataset_impl/hiertext.py b/llava/data/dataset_impl/hiertext.py
index 5cb46fa5..debd15d4 100755
--- a/llava/data/dataset_impl/hiertext.py
+++ b/llava/data/dataset_impl/hiertext.py
@@ -54,7 +54,9 @@
 from llava.data.dataset_impl.textocr import GenericDataset, preprocess_OCR
 from llava.train.args import DataArguments, TrainingArguments

+# nvcode: on
 DEFAULT_HIERTEXT = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/hiertext"
+# nvcode: off


 def bbx_from_vertices_list(vertices):
diff --git a/llava/data/dataset_impl/panda70m.py b/llava/data/dataset_impl/panda70m.py
index 1bbc07ae..c46cb0bb 100755
--- a/llava/data/dataset_impl/panda70m.py
+++ b/llava/data/dataset_impl/panda70m.py
@@ -62,8 +62,10 @@
 from llava.mm_utils import is_gemma_tokenizer, tokenizer_image_token
 from llava.train.args import DataArguments, TrainingArguments

+# nvcode: on
 # DEFAULT_HIERTEXT = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m"
 # SPLIT = "panda70m_testing"
+# nvcode: off


 def str2time(s):
diff --git a/llava/data/dataset_impl/sam.py b/llava/data/dataset_impl/sam.py
index 6fdf587c..46a1d1a2 100755
--- a/llava/data/dataset_impl/sam.py
+++ b/llava/data/dataset_impl/sam.py
@@ -227,6 +227,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
         return dict(input_ids=input_ids, labels=targets, image=image_list)


+# nvcode: on
 if __name__ == "__main__":
     data_path = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/sam-reformat"
     dst = VILAWebDataset(
@@ -235,3 +236,4 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
     # print(dst[0])
     for idx, data in enumerate(dst):
         print(idx, data.keys())
+# nvcode: off
diff --git a/llava/data/datasets_mixture.py b/llava/data/datasets_mixture.py
index 3035c584..2fa4c5b6 100755
--- a/llava/data/datasets_mixture.py
+++ b/llava/data/datasets_mixture.py
@@ -53,6 +53,7 @@ def add_dataset(dataset):


 def register_datasets_mixtures():
+    # nvcode: on
     nvclip_5m_vfc_recap30 = Dataset(
         dataset_name="nvclip_5m_vfc_recap30",
         dataset_type="imgtxt-wds",
@@ -61,9 +62,7 @@ def register_datasets_mixtures():
         end_idx=30 / 100,
         caption_choice="/home/ligengz/nvr_elm_llm/dataset/nv-clip-5m/vfc_longcaption_jsonraw",
     )
-
     add_dataset(nvclip_5m_vfc_recap30)
-
     nvclip_5m_vfc_recap70 = Dataset(
         dataset_name="nvclip_5m_vfc_recap70",
         dataset_type="imgtxt-wds",
@@ -1551,3 +1550,4 @@ def register_datasets_mixtures():
         image_path="/mnt/amlfs-01/home/fuzhaox/video_datasets_v2/sherlock/sherlock/images",
     )
     add_dataset(osmo_sherlock)
+    # nvcode: off
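Every entry in `register_datasets_mixtures` follows the same two-step pattern visible above: construct a `Dataset` record, then register it with `add_dataset`. A sketch of what a new entry would look like under that convention; the name and paths are hypothetical, and `data_path`/`start_idx` are assumed fields inferred from context rather than taken from the visible hunks:

# Hypothetical registration inside register_datasets_mixtures().
my_imgtxt_recap = Dataset(
    dataset_name="my_imgtxt_recap",
    dataset_type="imgtxt-wds",
    data_path="/path/to/my-webdataset",  # hypothetical path
    start_idx=0 / 100,  # assumed field; recap30/recap70 suggest percentage slicing
    end_idx=30 / 100,
    caption_choice="/path/to/recaption-json",  # hypothetical path
)
add_dataset(my_imgtxt_recap)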
"/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v3/ego4d/ego4d_clips_tar/ego4d_1m" CACHEDIR = "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v3/ego4d/ego4d_clips_tar/ego4d_1m-webds-meta" +# nvcode: off def process_tarfile(tar_abspath, tar_meta_path, cache_dir): @@ -102,15 +104,20 @@ def process_tarfile(tar_abspath, tar_meta_path, cache_dir): class SimpleVideoDataset(torch.utils.data.Dataset): def __init__( self, - data_path="/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt", - # cache_dir="/home/ligengz/.cache/simplecoyo", - # cache_dir="/lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/datasets/captioning/vila-webds-meta", - cache_dir="/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt-webds-meta", + data_path=None, # required + cache_dir=None, # required meta_path=None, - # image_load_mode="pil", # pil / rawbytes / fpath, max_shards_to_load=None, overwrite=False, ): + # nvcode: on + data_path = ( + "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt", + ) + cache_dir = ( + "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt-webds-meta", + ) + # nvcode: off self.data_path = data_path self.meta_path = meta_path if meta_path is None: @@ -189,9 +196,7 @@ def __init__( print(f"User name: {user_name}") self.dataset = ShardListDataset( self.meta_path, - cache_dir=osp.expanduser( - f"/lustre/fsw/portfolios/nvr/users/{user_name}/cache/_wids_cache/{getpass.getuser()}-{uuid}" - ), + cache_dir=osp.expanduser(f"~/.cache/_wids_cache/{getpass.getuser()}-{uuid}"), ) def __getitem__(self, idx): diff --git a/llava/data/simple_vila_webdataset.py b/llava/data/simple_vila_webdataset.py index 60b41dda..675715f2 100755 --- a/llava/data/simple_vila_webdataset.py +++ b/llava/data/simple_vila_webdataset.py @@ -192,7 +192,7 @@ def __init__( self, data_path=COYO_25M_VILA, meta_path=None, - cache_dir="/home/ligengz/datasets/vila-webds-meta", + cache_dir=osp.join(osp.expanduser("~/.cache/vila/webds-meta")), max_shards_to_load=None, ): self.data_path = osp.expanduser(data_path) diff --git a/llava/data_aug/srun_reformat.sh b/llava/data_aug/srun_reformat.sh index 5dd05047..16c75ebb 100755 --- a/llava/data_aug/srun_reformat.sh +++ b/llava/data_aug/srun_reformat.sh @@ -1,7 +1,7 @@ +# nvcode: on JOBS_LIMIT=100 # Set your limit here ACCOUNT=${ACCOUNT:-llmservice_nlp_fm} PARTITION=${PARTITION:-cpu,cpu_long} #draco: cpu,cpu_long,batch_singlenode,grizzly,polar - src_folder=/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/sam-raw for f in $src_folder/*.tar; do diff --git a/llava/data_aug/video_inference.py b/llava/data_aug/video_inference.py index 6eb25c4b..17727838 100755 --- a/llava/data_aug/video_inference.py +++ b/llava/data_aug/video_inference.py @@ -116,7 +116,7 @@ def eval_model(args): model_name = get_model_name_from_path(model_path) print(model_path) - video_list = list(glob.glob(osp.expanduser(osp.join(args.video_dir, "*.mp4")))) + video_list = list(glob.glob(osp.join(args.video_dir, "**/*.mp4"))) assert len(video_list) > 0, f"no video found in {args.video_dir}" tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, 
diff --git a/pyproject.toml b/pyproject.toml
index 223a9c07..507f8820 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ eval = ["word2number", "Levenshtein", "nltk", "pywsd"]

 [project.urls]
 "Homepage" = "https://hanlab.mit.edu/projects/vila"
-"Bug Tracker" = "https://github.com/Efficient-Large-Model/VILA-Internal/issues"
+"Bug Tracker" = "https://github.com/NVlabs/VILA/issues"

 [tool.black]
 line-length = 120
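Because `CIs/nvscope.py` hands `main` to `fire.Fire`, the confidential-path check can be pointed at a single file or a whole tree from the command line. A usage sketch, assuming it is run from the repository root:

python CIs/nvscope.py                 # defaults to llava/data/datasets_mixture.py
python CIs/nvscope.py llava/          # recursively checks every .py and .sh file
python CIs/nvscope.py CIs/nvscope.py  # a single file also works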