Skip to content

Commit

Permalink
Support nvscope (NVlabs#132)
Browse files Browse the repository at this point in the history
Co-authored-by: Zhijian Liu <[email protected]>
  • Loading branch information
Lyken17 and zhijian-liu authored Aug 26, 2024
1 parent d7d2686 commit 089f647
Show file tree
Hide file tree
Showing 12 changed files with 122 additions and 14 deletions.
92 changes: 92 additions & 0 deletions CIs/nvscope.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
import os.path as osp
import re
import subprocess
import sys

# Marker comments that delimit regions to exclude from scanning:
# a "# nvcode: on" line opens a region and a "# nvcode: off" line closes it.
# Whitespace around "#", "nvcode", ":" and the keyword is optional.
# The trailing \b stops "on"/"off" from matching as the prefix of a longer
# word (e.g. "# nvcode: only" or "# nvcode: offset").
r_on = re.compile(r"\s*#\s*nvcode\s*:\s*on\b")
r_off = re.compile(r"\s*#\s*nvcode\s*:\s*off\b")


def filter_nvcode(file_path):
    """Read ``file_path`` and drop every line inside an ``nvcode`` region.

    A line matching ``r_on`` opens a region and a line matching ``r_off``
    closes it. Both marker lines and everything between them are dropped.
    The ``r_off`` check runs first, so an "off" marker is dropped even when
    no region is currently open.

    Args:
        file_path: Path of the text file to read.

    Returns:
        tuple: ``(lines, new_idx, new_lines)`` where ``lines`` is every line
        of the file, ``new_lines`` the lines that were kept, and ``new_idx``
        the 0-based positions of those kept lines within ``lines``.
    """
    with open(file_path) as f:
        all_lines = f.readlines()

    kept_idx = []
    kept_lines = []
    inside_region = False
    for lineno, text in enumerate(all_lines):
        if r_off.search(text):
            # Closing marker: leave the region and drop the marker itself.
            inside_region = False
        elif not inside_region:
            if r_on.search(text):
                # Opening marker: start skipping from here on.
                inside_region = True
            else:
                kept_lines.append(text)
                kept_idx.append(lineno)

    return all_lines, kept_idx, kept_lines


def iterate_py_files(directory):
    """Recursively collect the ``.py`` and ``.sh`` files below ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        list: Paths (each joined onto its containing directory as reported by
        ``os.walk``) of every file whose name ends in ``.py`` or ``.sh``.
    """
    wanted_suffixes = (".py", ".sh")
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(wanted_suffixes)
    ]


def get_authors(file_path):
    """Return the distinct git author names that have touched ``file_path``.

    Runs ``git log`` with an argument list rather than a shell string, so
    paths containing quotes, spaces or shell metacharacters are passed
    through verbatim and cannot break or inject into the command.

    Args:
        file_path: Path of the file whose history is queried.

    Returns:
        list: Sorted, de-duplicated author names. Empty when git reports no
        history for the path, when git exits with an error, or when the git
        executable cannot be started.
    """
    cmd = ["git", "log", "--format=%aN", "--", file_path]
    try:
        # check=False: a git failure yields no authors instead of aborting
        # the scan. stderr is left untouched so git's own message still shows.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=False)
    except OSError:
        # git is not installed / not executable: authorship is best-effort.
        return []

    names = proc.stdout.decode("utf-8").splitlines()
    return sorted({name for name in names if name})


def check_rule(line):
    """Return True when ``line`` contains any flagged substring.

    Args:
        line: A single line of text to inspect.

    Returns:
        bool: True if at least one of the flagged substrings occurs in
        ``line``, otherwise False.
    """
    # Each entry is tested with ``in``. The original wrote a bare
    # ``or "cosmos_misc"``, which is always truthy and flagged every line.
    flagged = ("/lustre/fsw", "/home", "nvr_elm_llm", "llmservice", "cosmos_misc")
    return any(marker in line for marker in flagged)


def check_file_confidential_info(fpath):
    """Report lines of ``fpath`` that contain ``/lustre/fsw`` or ``/home``.

    Lines inside ``nvcode`` regions are removed by ``filter_nvcode`` before
    checking. Each offending line is printed together with the file path,
    its line number and the file's git authors.

    Args:
        fpath: Path of the file to scan.

    Returns:
        bool: True when no offending line was found, False otherwise.
    """
    _, kept_idx, kept_lines = filter_nvcode(fpath)

    authors = None  # resolved lazily, at most one git call per file
    pass_flag = True
    for line_idx, line in zip(kept_idx, kept_lines):
        if "/lustre/fsw" in line or "/home" in line:
            if authors is None:
                authors = get_authors(fpath)
            # filter_nvcode yields 0-based indices; report the 1-based line
            # number so it matches what editors and `git blame` display.
            print(f"{fpath} --- Line: {line_idx + 1} --- authors: {authors}")
            print("\t", line.strip())
            pass_flag = False

    return pass_flag


def main(fpath="llava/data/datasets_mixture.py"):
    """Scan a single file, or every ``.py``/``.sh`` file under a directory.

    Args:
        fpath: File or directory to scan. A path that is neither an existing
            directory nor an existing file results in nothing being scanned.

    Returns:
        None. Findings are printed by ``check_file_confidential_info``; its
        pass/fail result is not propagated to the caller.
    """
    if osp.isdir(fpath):
        targets = iterate_py_files(fpath)
    elif osp.isfile(fpath):
        targets = [fpath]
    else:
        targets = []

    for target in targets:
        check_file_confidential_info(target)


if __name__ == "__main__":
    # Command-line entry point: python-fire maps main()'s parameters onto
    # command-line arguments. The import is kept inside the guard so that
    # importing this module does not require fire.
    import fire

    fire.Fire(main)
3 changes: 3 additions & 0 deletions llava/data/dataset_impl/coyo_recap.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ def __init__(
n_samples_per_idx=n_samples_per_idx,
)
if getattr(data_args, "caption_choice", None) is None:
self.caption_choice = "~/workspace/coyo-25m-recap"
# nvcode: on
self.caption_choice = "/home/ligengz/workspace/coyo-25m-recap"
# nvcode: off
else:
self.caption_choice = data_args.caption_choice
print(f"Current caption choice: {self.caption_choice}.")
2 changes: 2 additions & 0 deletions llava/data/dataset_impl/general_img_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:


if __name__ == "__main__":
# nvcode: on
data_path = "/home/ligengz/nvr_elm_llm/dataset/nv-clip-5m/data"
# nvcode: off
dst = VILAWebDataset(
data_path=osp.abspath(data_path),
)
Expand Down
2 changes: 2 additions & 0 deletions llava/data/dataset_impl/hiertext.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@
from llava.data.dataset_impl.textocr import GenericDataset, preprocess_OCR
from llava.train.args import DataArguments, TrainingArguments

# nvcode: on
DEFAULT_HIERTEXT = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/hiertext"
# nvcode: off


def bbx_from_vertices_list(vertices):
Expand Down
2 changes: 2 additions & 0 deletions llava/data/dataset_impl/panda70m.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@
from llava.mm_utils import is_gemma_tokenizer, tokenizer_image_token
from llava.train.args import DataArguments, TrainingArguments

# nvcode: on
# DEFAULT_HIERTEXT = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/panda70m"
# SPLIT = "panda70m_testing"
# nvcode: off


def str2time(s):
Expand Down
2 changes: 2 additions & 0 deletions llava/data/dataset_impl/sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return dict(input_ids=input_ids, labels=targets, image=image_list)


# nvcode: on
if __name__ == "__main__":
data_path = "/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/sam-reformat"
dst = VILAWebDataset(
Expand All @@ -235,3 +236,4 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
# print(dst[0])
for idx, data in enumerate(dst):
print(idx, data.keys())
# nvcode: off
4 changes: 2 additions & 2 deletions llava/data/datasets_mixture.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def add_dataset(dataset):


def register_datasets_mixtures():
# nvcode: on
nvclip_5m_vfc_recap30 = Dataset(
dataset_name="nvclip_5m_vfc_recap30",
dataset_type="imgtxt-wds",
Expand All @@ -61,9 +62,7 @@ def register_datasets_mixtures():
end_idx=30 / 100,
caption_choice="/home/ligengz/nvr_elm_llm/dataset/nv-clip-5m/vfc_longcaption_jsonraw",
)

add_dataset(nvclip_5m_vfc_recap30)

nvclip_5m_vfc_recap70 = Dataset(
dataset_name="nvclip_5m_vfc_recap70",
dataset_type="imgtxt-wds",
Expand Down Expand Up @@ -1551,3 +1550,4 @@ def register_datasets_mixtures():
image_path="/mnt/amlfs-01/home/fuzhaox/video_datasets_v2/sherlock/sherlock/images",
)
add_dataset(osmo_sherlock)
# nvcode: off
21 changes: 13 additions & 8 deletions llava/data/simple_video_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,11 @@ def load_tarfile(tar_path):
return tarfile.open(tar_path)


# nvcode: on
# INTERNVID = "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video"
INTERNVID = "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v3/ego4d/ego4d_clips_tar/ego4d_1m"
CACHEDIR = "/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v3/ego4d/ego4d_clips_tar/ego4d_1m-webds-meta"
# nvcode: off


def process_tarfile(tar_abspath, tar_meta_path, cache_dir):
Expand Down Expand Up @@ -102,15 +104,20 @@ def process_tarfile(tar_abspath, tar_meta_path, cache_dir):
class SimpleVideoDataset(torch.utils.data.Dataset):
def __init__(
self,
data_path="/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt",
# cache_dir="/home/ligengz/.cache/simplecoyo",
# cache_dir="/lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/datasets/captioning/vila-webds-meta",
cache_dir="/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt-webds-meta",
data_path=None, # required
cache_dir=None, # required
meta_path=None,
# image_load_mode="pil", # pil / rawbytes / fpath,
max_shards_to_load=None,
overwrite=False,
):
# nvcode: on
data_path = (
"/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt",
)
cache_dir = (
"/lustre/fsw/portfolios/nvr/projects/nvr_aialgo_robogptagent/loragen_workspace/video_datasets_v2/internvid/video_data_tar/InternVid-1300K-flt-webds-meta",
)
# nvcode: off
self.data_path = data_path
self.meta_path = meta_path
if meta_path is None:
Expand Down Expand Up @@ -189,9 +196,7 @@ def __init__(
print(f"User name: {user_name}")
self.dataset = ShardListDataset(
self.meta_path,
cache_dir=osp.expanduser(
f"/lustre/fsw/portfolios/nvr/users/{user_name}/cache/_wids_cache/{getpass.getuser()}-{uuid}"
),
cache_dir=osp.expanduser(f"~/.cache/_wids_cache/{getpass.getuser()}-{uuid}"),
)

def __getitem__(self, idx):
Expand Down
2 changes: 1 addition & 1 deletion llava/data/simple_vila_webdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def __init__(
self,
data_path=COYO_25M_VILA,
meta_path=None,
cache_dir="/home/ligengz/datasets/vila-webds-meta",
cache_dir=osp.join(osp.expanduser("~/.cache/vila/webds-meta")),
max_shards_to_load=None,
):
self.data_path = osp.expanduser(data_path)
Expand Down
2 changes: 1 addition & 1 deletion llava/data_aug/srun_reformat.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# nvcode: on
JOBS_LIMIT=100 # Set your limit here
ACCOUNT=${ACCOUNT:-llmservice_nlp_fm}
PARTITION=${PARTITION:-cpu,cpu_long} #draco: cpu,cpu_long,batch_singlenode,grizzly,polar

src_folder=/lustre/fsw/portfolios/nvr/projects/nvr_elm_llm/dataset/sam-raw

for f in $src_folder/*.tar; do
Expand Down
2 changes: 1 addition & 1 deletion llava/data_aug/video_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def eval_model(args):
model_name = get_model_name_from_path(model_path)

print(model_path)
video_list = list(glob.glob(osp.expanduser(osp.join(args.video_dir, "*.mp4"))))
video_list = list(glob.glob(osp.join(args.video_dir, "**/*.mp4")))
assert len(video_list) > 0, f"no video found in {args.video_dir}"

tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_name, args.model_base)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ eval = ["word2number", "Levenshtein", "nltk", "pywsd"]

[project.urls]
"Homepage" = "https://hanlab.mit.edu/projects/vila"
"Bug Tracker" = "https://github.com/Efficient-Large-Model/VILA-Internal/issues"
"Bug Tracker" = "https://github.com/NVlabs/VILA/issues"

[tool.black]
line-length = 120
Expand Down

0 comments on commit 089f647

Please sign in to comment.