From 089f64753833a7ac1bafaa33da47a6a96a52cfdc Mon Sep 17 00:00:00 2001
From: Ligeng Zhu
Date: Mon, 26 Aug 2024 23:21:25 +0800
Subject: [PATCH] Support nvscope (#132)

Co-authored-by: Zhijian Liu <5782437+zhijian-liu@users.noreply.github.com>
---
 CIs/nvscope.py                              | 92 +++++++++++++++++++++
 llava/data/dataset_impl/coyo_recap.py       |  3 +
 llava/data/dataset_impl/general_img_text.py |  2 +
 llava/data/dataset_impl/hiertext.py         |  2 +
 llava/data/dataset_impl/panda70m.py         |  2 +
 llava/data/dataset_impl/sam.py              |  2 +
 llava/data/datasets_mixture.py              |  4 +-
 llava/data/simple_video_dataset.py          | 21 +++--
 llava/data/simple_vila_webdataset.py        |  2 +-
 llava/data_aug/srun_reformat.sh             |  2 +-
 llava/data_aug/video_inference.py           |  2 +-
 pyproject.toml                              |  2 +-
 12 files changed, 122 insertions(+), 14 deletions(-)
 create mode 100644 CIs/nvscope.py

diff --git a/CIs/nvscope.py b/CIs/nvscope.py
new file mode 100644
index 00000000..4d6ea08f
--- /dev/null
+++ b/CIs/nvscope.py
@@ -0,0 +1,92 @@
+import os
+import os.path as osp
+import re
+import subprocess
+import sys
+
+r_on = re.compile(r"""[\t\s]*#[\t\s]*nvcode[\t\s]*:[\t\s]*on[\t\s]*""")
+r_off = re.compile(r"""[\t\s]*#[\t\s]*nvcode[\t\s]*:[\t\s]*off[\t\s]*""")
+
+"""
+Strip all lines between an `# nvcode: on` marker and the next `# nvcode: off`.
+"""
+
+
+def filter_nvcode(file_path):
+    with open(file_path) as f:
+        lines = f.readlines()
+
+    new_lines = []
+    new_idx = []
+    skip_flag = False
+    for i, line in enumerate(lines):
+        if r_off.findall(line):
+            skip_flag = False
+            continue
+
+        if skip_flag:
+            continue
+
+        if r_on.findall(line):
+            skip_flag = True
+            continue
+        new_lines.append(line)
+        new_idx.append(i)
+
+    return lines, new_idx, new_lines
+
+
+def iterate_py_files(directory):
+    py_files = []
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith((".py", ".sh")):
+                py_files.append(os.path.join(root, file))
+    return py_files
+
+
+def get_authors(file_path):
+    cmd = f"git log --format='%aN' '{file_path}' | sort -u"
+    output = subprocess.check_output(cmd, shell=True, stderr=None).decode("utf-8")
+    authors = output.strip().split("\n")
+    return authors
+
+
+def check_rule(line):
+    if "/lustre/fsw" in line or "/home" in line:
+        return True
+
+    if "nvr_elm_llm" in line or "llmservice" in line or "cosmos_misc" in line:
+        return True
+
+    return False
+
+
+def check_file_confidential_info(fpath):
+    lines, new_idx, new_lines = filter_nvcode(fpath)
+
+    pass_flag = True
+    for idx, (_idx, line) in enumerate(zip(new_idx, new_lines)):
+        if "/lustre/fsw" in line or "/home" in line:
+            authors = get_authors(fpath)
+            print(f"{fpath} --- Line: {_idx} --- authors: {authors}")
+            print("\t", line.strip())
+            pass_flag = False
+
+    return pass_flag
+
+
+def main(fpath="llava/data/datasets_mixture.py"):
+    if osp.isdir(fpath):
+        py_files = iterate_py_files(fpath)
+        for fpath in py_files:
+            res = check_file_confidential_info(fpath)
+    elif osp.isfile(fpath):
+        res = check_file_confidential_info(fpath)
+
+
+if __name__ == "__main__":
+    import fire
+
+    fire.Fire(main)
+    # print("".join(res))
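The core of the checker is `filter_nvcode`: every line between an `# nvcode: on` marker and the next `# nvcode: off` is dropped, including the marker lines themselves, and only the surviving lines are scanned for confidential paths. A minimal sketch of that behavior, assuming the script is invoked from the repository root so `CIs.nvscope` is importable; the sample file content is made up for illustration:

import tempfile

from CIs.nvscope import filter_nvcode

# A throwaway file whose middle section is fenced off by nvcode markers.
sample = (
    "keep_this = 1\n"
    "# nvcode: on\n"
    "secret = '/lustre/fsw/internal'\n"
    "# nvcode: off\n"
    "keep_this_too = 2\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
    f.write(sample)

lines, kept_idx, kept_lines = filter_nvcode(f.name)
# Only the lines outside the markers survive, with their original indices.
assert kept_lines == ["keep_this = 1\n", "keep_this_too = 2\n"]
assert kept_idx == [0, 4]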
diff --git a/llava/data/dataset_impl/coyo_recap.py b/llava/data/dataset_impl/coyo_recap.py
index ca1d58c1..0fb70fe6 100755
--- a/llava/data/dataset_impl/coyo_recap.py
+++ b/llava/data/dataset_impl/coyo_recap.py
@@ -88,7 +88,10 @@ def __init__(
             n_samples_per_idx=n_samples_per_idx,
         )
         if getattr(data_args, "caption_choice", None) is None:
+            self.caption_choice = "~/workspace/coyo-25m-recap"
+            # nvcode: on
             self.caption_choice = "/home/ligengz/workspace/coyo-25m-recap"
+            # nvcode: off
         else:
             self.caption_choice = data_args.caption_choice
         print(f"Current caption choice: {self.caption_choice}.")
diff --git a/llava/data/dataset_impl/general_img_text.py b/llava/data/dataset_impl/general_img_text.py
index 278eca1b..3571b85f 100644
--- a/llava/data/dataset_impl/general_img_text.py
+++ b/llava/data/dataset_impl/general_img_text.py
@@ -215,7 +215,9 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:


 if __name__ == "__main__":
+    # nvcode: on
     data_path = "/home/ligengz/nvr_elm_llm/dataset/nv-clip-5m/data"
+    # nvcode: off
     dst = VILAWebDataset(
         data_path=osp.abspath(data_path),
     )
diff --git a/llava/data/dataset_impl/hiertext.py b/llava/data/dataset_impl/hiertext.py
index 5cb46fa5..debd15d4 100755
--- a/llava/data/dataset_impl/hiertext.py
+++ b/llava/data/dataset_impl/hiertext.py
@@ -54,7 +54,9 @@
 from llava.data.dataset_impl.textocr import GenericDataset, preprocess_OCR
 from llava.train.args import DataArguments, TrainingArguments

+# nvcode: on
 DEFAULT_HIERTEXT = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/hiertext"
+# nvcode: off


 def bbx_from_vertices_list(vertices):
diff --git a/llava/data/dataset_impl/panda70m.py b/llava/data/dataset_impl/panda70m.py
index 1bbc07ae..c46cb0bb 100755
--- a/llava/data/dataset_impl/panda70m.py
+++ b/llava/data/dataset_impl/panda70m.py
@@ -62,8 +62,10 @@
 from llava.mm_utils import is_gemma_tokenizer, tokenizer_image_token
 from llava.train.args import DataArguments, TrainingArguments

+# nvcode: on
 # DEFAULT_HIERTEXT = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m"
 # SPLIT = "panda70m_testing"
+# nvcode: off


 def str2time(s):
diff --git a/llava/data/dataset_impl/sam.py b/llava/data/dataset_impl/sam.py
index 6fdf587c..46a1d1a2 100755
--- a/llava/data/dataset_impl/sam.py
+++ b/llava/data/dataset_impl/sam.py
@@ -227,6 +227,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
         return dict(input_ids=input_ids, labels=targets, image=image_list)


+# nvcode: on
 if __name__ == "__main__":
     data_path = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/sam-reformat"
     dst = VILAWebDataset(
@@ -235,3 +236,4 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
     # print(dst[0])
     for idx, data in enumerate(dst):
         print(idx, data.keys())
+# nvcode: off
diff --git a/llava/data/datasets_mixture.py b/llava/data/datasets_mixture.py
index 3035c584..2fa4c5b6 100755
--- a/llava/data/datasets_mixture.py
+++ b/llava/data/datasets_mixture.py
@@ -53,6 +53,7 @@ def add_dataset(dataset):


 def register_datasets_mixtures():
+    # nvcode: on
     nvclip_5m_vfc_recap30 = Dataset(
         dataset_name="nvclip_5m_vfc_recap30",
         dataset_type="imgtxt-wds",
@@ -61,9 +62,7 @@ def register_datasets_mixtures():
         end_idx=30 / 100,
         caption_choice="/home/ligengz/nvr_elm_llm/dataset/nv-clip-5m/vfc_longcaption_jsonraw",
     )
-
     add_dataset(nvclip_5m_vfc_recap30)
-
     nvclip_5m_vfc_recap70 = Dataset(
         dataset_name="nvclip_5m_vfc_recap70",
         dataset_type="imgtxt-wds",
@@ -1551,3 +1550,4 @@ def register_datasets_mixtures():
         image_path="/mnt/amlfs-01/home/fuzhaox/video_datasets_v2/sherlock/sherlock/images",
     )
     add_dataset(osmo_sherlock)
+    # nvcode: off
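Every entry in `register_datasets_mixtures` follows the same two-step pattern visible above: construct a `Dataset` record, then register it with `add_dataset`. A sketch of what a new entry would look like under that convention; the name and paths are hypothetical, and `data_path`/`start_idx` are assumed fields inferred from context rather than taken from the visible hunks:

# Hypothetical registration inside register_datasets_mixtures().
my_imgtxt_recap = Dataset(
    dataset_name="my_imgtxt_recap",
    dataset_type="imgtxt-wds",
    data_path="/path/to/my-webdataset",  # hypothetical path
    start_idx=0 / 100,  # assumed field; recap30/recap70 suggest percentage slicing
    end_idx=30 / 100,
    caption_choice="/path/to/recaption-json",  # hypothetical path
)
add_dataset(my_imgtxt_recap)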
"/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v3/ego4d/ego4d_clips_tar/ego4d_1m" CACHEDIR = "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v3/ego4d/ego4d_clips_tar/ego4d_1m-webds-meta" +# nvcode: off def process_tarfile(tar_abspath, tar_meta_path, cache_dir): @@ -102,15 +104,20 @@ def process_tarfile(tar_abspath, tar_meta_path, cache_dir): class SimpleVideoDataset(torch.utils.data.Dataset): def __init__( self, - data_path="/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt", - # cache_dir="/home/ligengz/.cache/simplecoyo", - # cache_dir="/lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/datasets/captioning/vila-webds-meta", - cache_dir="/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt-webds-meta", + data_path=None, # required + cache_dir=None, # required meta_path=None, - # image_load_mode="pil", # pil / rawbytes / fpath, max_shards_to_load=None, overwrite=False, ): + # nvcode: on + data_path = ( + "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt", + ) + cache_dir = ( + "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt-webds-meta", + ) + # nvcode: off self.data_path = data_path self.meta_path = meta_path if meta_path is None: @@ -189,9 +196,7 @@ def __init__( print(f"User name: {user_name}") self.dataset = ShardListDataset( self.meta_path, - cache_dir=osp.expanduser( - f"/lustre/fsw/portfolios/nvr/users/{user_name}/cache/_wids_cache/{getpass.getuser()}-{uuid}" - ), + cache_dir=osp.expanduser(f"~/.cache/_wids_cache/{getpass.getuser()}-{uuid}"), ) def __getitem__(self, idx): diff --git a/llava/data/simple_vila_webdataset.py b/llava/data/simple_vila_webdataset.py index 60b41dda..675715f2 100755 --- a/llava/data/simple_vila_webdataset.py +++ b/llava/data/simple_vila_webdataset.py @@ -192,7 +192,7 @@ def __init__( self, data_path=COYO_25M_VILA, meta_path=None, - cache_dir="/home/ligengz/datasets/vila-webds-meta", + cache_dir=osp.join(osp.expanduser("~/.cache/vila/webds-meta")), max_shards_to_load=None, ): self.data_path = osp.expanduser(data_path) diff --git a/llava/data_aug/srun_reformat.sh b/llava/data_aug/srun_reformat.sh index 5dd05047..16c75ebb 100755 --- a/llava/data_aug/srun_reformat.sh +++ b/llava/data_aug/srun_reformat.sh @@ -1,7 +1,7 @@ +# nvcode: on JOBS_LIMIT=100 # Set your limit here ACCOUNT=${ACCOUNT:-llmservice_nlp_fm} PARTITION=${PARTITION:-cpu,cpu_long} #draco: cpu,cpu_long,batch_singlenode,grizzly,polar - src_folder=/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/sam-raw for f in $src_folder/*.tar; do diff --git a/llava/data_aug/video_inference.py b/llava/data_aug/video_inference.py index 6eb25c4b..17727838 100755 --- a/llava/data_aug/video_inference.py +++ b/llava/data_aug/video_inference.py @@ -116,7 +116,7 @@ def eval_model(args): model_name = get_model_name_from_path(model_path) print(model_path) - video_list = list(glob.glob(osp.expanduser(osp.join(args.video_dir, "*.mp4")))) + video_list = list(glob.glob(osp.join(args.video_dir, "**/*.mp4"))) assert len(video_list) > 0, f"no video found in {args.video_dir}" tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, 
diff --git a/pyproject.toml b/pyproject.toml
index 223a9c07..507f8820 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ eval = ["word2number", "Levenshtein", "nltk", "pywsd"]

 [project.urls]
 "Homepage" = "https://hanlab.mit.edu/projects/vila"
-"Bug Tracker" = "https://github.com/Efficient-Large-Model/VILA-Internal/issues"
+"Bug Tracker" = "https://github.com/NVlabs/VILA/issues"

 [tool.black]
 line-length = 120
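Because `CIs/nvscope.py` hands `main` to `fire.Fire`, the confidential-path check can be pointed at a single file or a whole tree from the command line. A usage sketch, assuming it is run from the repository root:

python CIs/nvscope.py                 # defaults to llava/data/datasets_mixture.py
python CIs/nvscope.py llava/          # recursively checks every .py and .sh file
python CIs/nvscope.py CIs/nvscope.py  # a single file also works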