text/image-to-4d
YuyangYin committed Feb 14, 2024
1 parent 8bdf0cc commit 6a0b2a2
Showing 11 changed files with 186 additions and 9 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -22,3 +22,7 @@ olddata
output_flow/
exp_data/
**/.DS_Store
arguments/i2v.py
train.py
guidance/zero123_utils.py
output
37 changes: 33 additions & 4 deletions README.md
@@ -3,13 +3,18 @@ Authors: Yuyang Yin, Dejia Xu, Zhangyang Wang, Yao Zhao, Yunchao Wei

[[Project Page]](https://vita-group.github.io/4DGen/) | [[Video (narrated)]](https://www.youtube.com/watch?v=-bXyBKdpQ1o) | [[Video (results only)]](https://www.youtube.com/watch?v=Hbava1VpeXY) | [[Paper]](https://github.com/VITA-Group/4DGen/blob/main/4dgen.pdf) | [[Arxiv]](https://arxiv.org/abs/2312.17225)

![overview](https://raw.githubusercontent.com/VITA-Group/4DGen/main/docs/static/media/task.a51c143187610723eb8f.png)
<!-- ![overview](https://raw.githubusercontent.com/VITA-Group/4DGen/main/docs/static/media/task.a51c143187610723eb8f.png) -->

![overview](docs/static/media/task.png)

## News
- `2023/12/28` Release code and paper.
- `2024/2/1` Enhance the coherence of video outputs at low fps.
- `2024/2/1` Update text-to-4d and image-to-4d functions and cases.
- `2024/2/14` Update text-to-4d and image-to-4d functions and cases.

## Task Type
As shown in the figure above, we define grounded 4D generation, which focuses on video-to-4D generation. The input video does not have to be user-specified; it can also be generated by video diffusion. With the help of [stable video diffusion](https://github.com/nateraw/stable-diffusion-videos), we implement image-to-video-to-4d. Because current text-to-video models perform unsatisfactorily, we chain [stable diffusion-XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with [stable video diffusion](https://github.com/nateraw/stable-diffusion-videos) to implement text-to-image-to-video-to-4d.




@@ -34,13 +39,26 @@ pip install kaolin -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-1.1

## Data Preparation

We release our collected data in [Google Drive](https://drive.google.com/drive/folders/1-lbtj-YiA7d0Nbe6Qcc_t0W_CKKEw_bm?usp=drive_link).
We release our collected data in [Google Drive](https://drive.google.com/drive/folders/1-lbtj-YiA7d0Nbe6Qcc_t0W_CKKEw_bm?usp=drive_link). Some of these data are user-specified, while others are generated.

Each test case contains two folders: `{name}_pose0` and `{name}_sync`. `pose0` refers to the monocular video sequence. `sync` refers to the pseudo labels generated by [SyncDreamer](https://github.com/liuyuan-pal/SyncDreamer).
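
For orientation, a hypothetical case named `panda` would look roughly like this (the `{idx}.png` frame naming matches the dataset loader; exact file names and counts depend on your data):

```
data/
├── panda_pose0/   # monocular video frames: 0.png, 1.png, ...
└── panda_sync/    # multi-view pseudo labels generated by SyncDreamer
```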

We recommend using [Practical-RIFE](https://github.com/hzwer/Practical-RIFE) if you need to interpolate additional frames into your video sequence, as sketched below.
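
A hypothetical invocation, run from inside a Practical-RIFE checkout; the entry point and flag names here are assumptions that vary between RIFE releases, so check that repository's README first:

```bash
# double the number of frames of a clip (2x temporal interpolation)
python3 inference_video.py --multi=2 --video=/path/to/clip.mp4
```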

To preprocess your own images into RGBA format, one can use `preprocess.py` or `preprocess_sync.py`
**Text-to-4D data preparation**

Use [stable diffusion-XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) to generate your own images, then use the image-to-video script below.
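
As a minimal sketch of this step with `diffusers` (the checkpoint is the one linked above; the prompt and output path are only illustrative):

```python
import torch
from diffusers import StableDiffusionXLPipeline

# load SDXL base in fp16 on the GPU
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

# a single uncluttered subject tends to lift into 4D most cleanly
image = pipe(prompt="a corgi running, full body, plain background").images[0]
image.save("data/corgi.png")
```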

**Image-to-4D data preparation**
```bash
python image_to_video.py --data_path {image path} --name {case name} # it may be necessary to try multiple seeds to obtain the desired results
```
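
When no `--seed` is given, the script sweeps seeds 0-29 so you can pick the most coherent clip. For each run it writes an mp4, a gif, and the individual frames to `data/{name}_svd/{seed}/`.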

**Preprocess data format for training**

To preprocess your own images into RGBA format, you can use `preprocess.py`.

To turn your own images into multi-view images, you can use the [SyncDreamer](https://github.com/liuyuan-pal/SyncDreamer) script, then run `preprocess_sync.py` to bring the outputs into a uniform format.

```bash
# for monocular image sequence
@@ -63,6 +81,8 @@ python render.py --skip_train --configs arguments/i2v.py --skip_test --model_pat





## Evaluation
For the CLIP loss, we compute the CLIP distance between rendered images and reference images. The reference images are the n input frames; the rendered images cover 10 viewpoints at each timestep.

@@ -73,6 +93,15 @@
bash eval.bash #please change file paths before running
```


## Result
We show part of the results on our [project page](https://vita-group.github.io/4DGen/).

The text-to-4d results are below (the result GIFs display in the rendered README):




## Acknowledgement

This work is built on many amazing research works and open-source projects, thanks a lot to all the authors for sharing!
Expand Down
6 changes: 3 additions & 3 deletions arguments/i2v.py
@@ -20,8 +20,8 @@
)

ModelParams = dict(
    frame_num = 16,
    name="rose",
    frame_num = 14,
    name="toy0",
    rife=False,
)

@@ -44,6 +44,6 @@
        'grid_dimensions': 2,
        'input_coordinate_dim': 4,
        'output_coordinate_dim': 32,
        'resolution': [64, 64, 64, 8] # 8 is frame numbers/2
        'resolution': [64, 64, 64, 7] # 7 is frame numbers/2 (frame_num = 14)
    }
)
Binary file added docs/static/media/task.png
2 changes: 2 additions & 0 deletions guidance/zero123_utils.py
@@ -25,6 +25,8 @@ def __init__(self, device, fp16=True, t_range=[0.2, 0.6], zero123_path='ashawkey
        self.device = device
        self.fp16 = fp16
        self.dtype = torch.float16 if fp16 else torch.float32
        # prefer a local copy of the stable-zero123 weights when one exists
        if os.path.exists("/data/users/yyy/Largemodelashawkey/stable-zero123-diffusers"):
            zero123_path = "/data/users/yyy/Largemodelashawkey/stable-zero123-diffusers"
        self.pipe = Zero123Pipeline.from_pretrained(
            zero123_path,
            variant="fp16_ema" if self.fp16 else None,
Expand Down
104 changes: 104 additions & 0 deletions image_to_video.py
@@ -0,0 +1,104 @@
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, export_to_gif  # load_image is redefined below

from PIL import Image
import numpy as np

import cv2
import rembg
import os
import argparse

def add_margin(pil_img, top, right, bottom, left, color):
    width, height = pil_img.size
    new_width = width + right + left
    new_height = height + top + bottom
    result = Image.new(pil_img.mode, (new_width, new_height), color)
    result.paste(pil_img, (left, top))
    return result

def resize_image(image, output_size=(1024, 576)):
    # resize to a square matching the target height, then pad left/right
    # with the corner pixel color to reach the target aspect ratio
    image = image.resize((output_size[1], output_size[1]))
    pad_size = (output_size[0] - output_size[1]) // 2
    image = add_margin(image, 0, pad_size, 0, pad_size, tuple(np.array(image)[0, 0]))
    return image


def load_image(file, W, H, bg='white'):
    # load image
    print(f'[INFO] load image from {file}...')
    img = cv2.imread(file, cv2.IMREAD_UNCHANGED)
    bg_remover = rembg.new_session()
    img = rembg.remove(img, session=bg_remover)
    img = cv2.resize(img, (W, H), interpolation=cv2.INTER_AREA)
    img = img.astype(np.float32) / 255.0
    input_mask = img[..., 3:]
    # white bg
    if bg == 'white':
        input_img = img[..., :3] * input_mask + (1 - input_mask)
    elif bg == 'black':
        input_img = img[..., :3]
    else:
        raise NotImplementedError
    # bgr to rgb
    input_img = input_img[..., ::-1].copy()
    input_img = Image.fromarray(np.uint8(input_img * 255))
    return input_img

def load_image_w_bg(file, W, H):
    # load image
    print(f'[INFO] load image from {file}...')
    img = cv2.imread(file, cv2.IMREAD_UNCHANGED)
    img = cv2.resize(img, (W, H), interpolation=cv2.INTER_AREA)
    img = img.astype(np.float32) / 255.0
    input_img = img[..., :3]
    # bgr to rgb
    input_img = input_img[..., ::-1].copy()
    input_img = Image.fromarray(np.uint8(input_img * 255))
    return input_img

def gen_vid(data_path, name, seed, bg, is_pad):
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid", torch_dtype=torch.float16, variant="fp16"
    )
    # pipe.enable_model_cpu_offload()
    pipe.to("cuda")

    # SVD generates at 1024x576; without padding we run on a 512x512 square
    if is_pad:
        height, width = 576, 1024
    else:
        height, width = 512, 512

    save_dir = f"data/{name}_svd/{seed}"
    os.makedirs(save_dir, exist_ok=True)

    image = load_image(data_path, width, height, bg)
    if is_pad:
        image = resize_image(image, output_size=(width, height))
    generator = torch.manual_seed(seed)
    frames = pipe(image, height, width, generator=generator).frames[0]

    export_to_video(frames, f"{save_dir}/{name}_generated.mp4", fps=8)
    export_to_gif(frames, f"{save_dir}/{name}_generated.gif")
    for idx, img in enumerate(frames):
        if is_pad:
            # crop the side padding back off to recover the square subject
            img = img.crop(((width - height) // 2, 0, width - (width - height) // 2, height))
        img.save(f"{save_dir}/{idx}.png")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--name", type=str, required=True)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--bg", type=str, default='white')
    # note: argparse's type=bool treats any non-empty string as True, so use a flag instead
    parser.add_argument("--is_pad", action='store_true')
    args, extras = parser.parse_known_args()
    if args.seed is None:
        # no seed given: sweep seeds 0-29 and keep the most plausible result
        for seed in range(30):
            gen_vid(args.data_path, args.name, seed, args.bg, args.is_pad)
    else:
        gen_vid(args.data_path, args.name, args.seed, args.bg, args.is_pad)
2 changes: 1 addition & 1 deletion preprocess_sync.py
@@ -66,6 +66,6 @@ def chop_image_into_16(image):
    final_rgba = carved_image

    # write image
    out_rgba = os.path.join(opt.path, out_base + f'_{idx}_rgba.png')
    out_rgba = os.path.join(opt.path, out_base + f'_0_{idx}_rgba.png')
    cv2.imwrite(out_rgba, final_rgba)
    print('out path:', out_rgba)
1 change: 1 addition & 0 deletions scene/__init__.py
@@ -52,6 +52,7 @@ def __init__(self, args : ModelParams, gaussians : GaussianModel, load_iteration
            ds = ImageDreamdataset
        else:
            ds = FourDGSdataset
        print('args.frame_num:', args.frame_num)
        self.train_camera = ds(split='train', frame_num=args.frame_num, name=args.name, rife=args.rife, static=args.static)
        print("Loading Test Cameras")
        self.maxtime = self.train_camera.pose0_num
4 changes: 4 additions & 0 deletions scene/i2v_dataset.py
@@ -92,12 +92,16 @@ def __init__(

        frame_list = range(frame_num)
        pose0_im_names = [pose0_dir + f'{x}.png' for x in frame_list]
        print('pose0_im_names:', pose0_im_names)
        idx_list = range(frame_num)
        if not os.path.exists(pose0_im_names[0]): # check 0 index
            pose0_im_names = pose0_im_names[1:] + [pose0_dir + f'{frame_num}.png'] # use 1 index
            idx_list = list(idx_list)[1:] + [frame_num]

        base_dir = f'./data/{self.name}_sync'
        if not os.path.exists(base_dir):
            base_dir = f'./data/{self.name}_harm'
        print('frame_num:', frame_num)

        syncdreamer_im = []
        # for fname in t0_im_names:
33 changes: 33 additions & 0 deletions svd.py
@@ -0,0 +1,33 @@
import os
import random
from concurrent.futures import ThreadPoolExecutor

directory_path = '/data/users/yyy/4dgen_exp/data/harmonyview_testset'

# list every file in the test-set directory
file_list = os.listdir(directory_path)

# sequential single-GPU version, kept for reference
# for file in file_list:
#     file_name = file.split('/')[-1].split('.')[0]
#     file_name = 'harm_' + file_name
#     print(file_name)
#     file_path = directory_path + '/' + file
#     cmd = f'CUDA_VISIBLE_DEVICES="1" python image_to_video.py --data_path {file_path} --name {file_name}'
#     print(cmd)
#     os.system(cmd)

def process_file(file):
    file_name = file.split('/')[-1].split('.')[0]
    file_name = 'harm_' + file_name
    print(file_name)

    file_path = os.path.join(directory_path, file)
    # pick a GPU in [0, 4] at random to spread the jobs
    cuda_device = random.randint(0, 4)
    cmd = f'CUDA_VISIBLE_DEVICES="{cuda_device}" python image_to_video.py --data_path {file_path} --name {file_name}'
    print(cmd)

    os.system(cmd)

# map once over the whole file list; the original outer `for file in file_list:`
# loop re-ran every job len(file_list) times
with ThreadPoolExecutor() as executor:
    executor.map(process_file, file_list)
2 changes: 1 addition & 1 deletion train.py
@@ -36,7 +36,7 @@

from PIL import Image
from torchvision.transforms import ToTensor
from kaolin.metrics.pointcloud import chamfer_distance
# from kaolin.metrics.pointcloud import chamfer_distance
from plyfile import PlyData

def scene_reconstruction(dataset, opt, hyper, pipe, testing_iterations, saving_iterations,
