release tune-a-video configs
ChenyangQiQi committed Apr 4, 2023
1 parent 1777971 commit 4ec25a9
Showing 6 changed files with 199 additions and 17 deletions.
23 changes: 18 additions & 5 deletions README.md
@@ -54,7 +54,7 @@ previous works.
</details>

## 📋 Changelog
- 2023.04.04 Release shape editing [ckpts](https://huggingface.co/chenyangqi/), [data](https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/shape.zip) and [config](config/shape)
- 2023.04.04 Release enhanced Tune-A-Video [configs](config/tune), and shape editing [ckpts](https://huggingface.co/chenyangqi/), [data](https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/shape.zip) and [config](config/shape)
- 2023.03.31 Refine the Hugging Face demo
<!-- - 2023.03.27 Excited to Release [`Hugging face demo`](https://huggingface.co/spaces/chenyangqi/FateZero)! (refinement is in progress) Enjoy the fun of zero-shot video editing freely!
- 2023.03.27 Release [`attribute editing config`](config/attribute) and
@@ -80,10 +80,9 @@ previous works.

## 🚧 Todo

- [x] Release the edit config and data for all results
- [x] Release the edit config and data for all results, and Tune-A-Video optimization
- [x] Memory and runtime profiling and Editing guidance documents
- [x] Colab and hugging-face
- [ ] Tune-a-video optimization
- [ ] Release more applications

## 🛡 Setup Environment
@@ -233,8 +232,22 @@ git clone https://huggingface.co/chenyangqi/swan_150

Then use the commands in ['config/shape'](config/shape).

## Tuning guidance to edit YOUR video
We provide tuning guidance for editing in-the-wild videos [here](./docs/EditingGuidance.md). The work is still in progress; feedback via issues is welcome.
For the above Tune-A-Video checkpoints, we finetune Stable Diffusion with a synthetic negative-prompt [dataset](https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/negative_reg.zip) for regularization and a low-rank convolution for temporally consistent generation, using the [tuning config](./config/tune/).

<details><summary>Click for the bash command example: </summary>

```bash
cd ./data
wget https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/negative_reg.zip
unzip negative_reg.zip
cd ..
accelerate launch train_tune_a_video.py --config config/tune/jeep.yaml
```

</details>
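
For intuition, below is a minimal sketch of a low-rank temporal convolution of the kind the sentence above refers to. It is an illustration only, not the repository's actual implementation: the class name, the residual wiring, and the reading of `lora` in `model_config` as the factorization rank are all assumptions.

```python
import torch
import torch.nn as nn

class LowRankTemporalConv(nn.Module):
    """Illustrative low-rank temporal convolution (NOT the actual FateZero code).

    Assumptions: the `lora` value in model_config is the rank of a factorized 1D
    convolution over the frame axis, and `train_temporal_conv: True` makes only
    such layers trainable.
    """

    def __init__(self, channels: int, rank: int = 16, kernel_size: int = 3):
        super().__init__()
        # Factorize a full temporal conv (channels x channels x k) into two
        # thin convs: channels -> rank -> channels.
        self.down = nn.Conv1d(channels, rank, kernel_size, padding=kernel_size // 2)
        self.up = nn.Conv1d(rank, channels, kernel_size=1)
        nn.init.zeros_(self.up.weight)  # start as an identity mapping (residual = 0)
        nn.init.zeros_(self.up.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, frames, height, width)
        b, c, f, h, w = x.shape
        y = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, f)  # convolve over frames
        y = self.up(self.down(y))
        y = y.reshape(b, h, w, c, f).permute(0, 3, 4, 1, 2)
        return x + y  # residual: pretrained spatial weights stay intact
```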

## Editing guidance for YOUR video
We provide editing guidance for in-the-wild videos [here](./docs/EditingGuidance.md). The work is still in progress; feedback via issues is welcome.

## Style Editing Results with Stable Diffusion
We show the difference between the source prompt and the target prompt in the box below each video.
58 changes: 58 additions & 0 deletions config/tune/jeep.yaml
@@ -0,0 +1,58 @@
# CUDA_VISIBLE_DEVICES=3 python train_tune_a_video.py --config config/tune/jeep.yaml
# There is no obvious difference between v1-4 and v1-5; we simply chose to tune v1-5 at the beginning of this project.
pretrained_model_path: "./ckpt/stable-diffusion-v1-5"

train_dataset:
  path: "data/shape/teaser_car-turn"
  prompt: "a silver jeep driving down a curvy road in the countryside,"
  n_sample_frame: 8
  # n_sample_frame: 22
  class_data_root: "data/negative_reg/car"
  class_data_prompt: "a photo of a car"

  sampling_rate: 1
  stride: 80
  offset:
    left: 0
    right: 0
    top: 0
    bottom: 0

validation_sample_logger_config:
  use_train_latents: True
  use_inversion_attention: True
  guidance_scale: 7.5
  prompts: [
    a silver jeep driving down a curvy road in the countryside,
    a Porsche car driving down a curvy road in the countryside,
    watercolor painting of a silver jeep driving down a curvy road in the countryside,
  ]
  clip_length: "${..train_dataset.n_sample_frame}"
  sample_seeds: [12734]
  val_all_frames: False
  num_inference_steps: 50 # 15 minutes
  strength: 0.99

trainer_pipeline_config:
  target: video_diffusion.trainer.ddpm_trainer.DDPMTrainer

test_pipeline_config:
  target: video_diffusion.pipelines.DDIMSpatioTemporalStableDiffusionPipeline.DDIMSpatioTemporalStableDiffusionPipeline

model_config:
  lora: 160
  # temporal_downsample_time: 4
  # SparseCausalAttention_index: [-1, 1, 'first', 'last']

enable_xformers: True
mixed_precision: 'fp16'
gradient_checkpointing: True

train_steps: 1000
validation_steps: 50
checkpointing_steps: 50
seed: 74831
learning_rate: 1e-5
# prior_preservation: 1.0
train_temporal_conv: True
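
One detail worth noting in these configs: `clip_length: "${..train_dataset.n_sample_frame}"` is an OmegaConf relative interpolation, so the validation clip length always tracks the training clip length. A small self-contained sketch (assuming OmegaConf >= 2.1 resolution semantics) of how it resolves:

```python
from omegaconf import OmegaConf

# "${..train_dataset.n_sample_frame}" steps one level up from
# validation_sample_logger_config and then reads train_dataset.n_sample_frame.
cfg = OmegaConf.create({
    "train_dataset": {"n_sample_frame": 8},
    "validation_sample_logger_config": {
        "clip_length": "${..train_dataset.n_sample_frame}",
    },
})
print(cfg.validation_sample_logger_config.clip_length)  # -> 8
```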
59 changes: 59 additions & 0 deletions config/tune/man_skate.yaml
@@ -0,0 +1,59 @@
# CUDA_VISIBLE_DEVICES=4 python train_tune_a_video.py --config config/tune/man_skate.yaml
pretrained_model_path: "./ckpt/stable-diffusion-v1-4"

train_dataset:
  path: "./data/shape/man_skate"
  prompt: "A man rides a wooden skateboard on the rail with a helmet and arms outstretched"
  n_sample_frame: 8
  # n_sample_frame: 22
  # class_data_root: "data/negative_reg/birds"
  # class_data_prompt: "a photo of a bird"

  sampling_rate: 1
  stride: 80
  offset:
    left: 0
    right: 0
    top: 0
    bottom: 0

validation_sample_logger_config:
  use_train_latents: True
  use_inversion_attention: True
  guidance_scale: 7.5
  prompts: [
    # source prompt
    A man rides a wooden skateboard on the rail with a helmet and arms outstretched,

    # foreground color and species
    A Wonder Woman rides a wooden skateboard on the rail with cowboy hat and arms outstretched,
    A Batman rides a wooden skateboard on the rail and arms outstretched,
  ]
  clip_length: "${..train_dataset.n_sample_frame}"
  sample_seeds: [12734]
  val_all_frames: False
  num_inference_steps: 50 # 15 minutes
  strength: 0.99

trainer_pipeline_config:
  target: video_diffusion.trainer.ddpm_trainer.DDPMTrainer

test_pipeline_config:
  target: video_diffusion.pipelines.DDIMSpatioTemporalStableDiffusionPipeline.DDIMSpatioTemporalStableDiffusionPipeline

model_config:
  lora: 16
  # temporal_downsample_time: 4
  # SparseCausalAttention_index: [-1, 1, 'first', 'last']

enable_xformers: True
mixed_precision: 'fp16'
gradient_checkpointing: True

train_steps: 1000
validation_steps: 50 # 10 minutes
checkpointing_steps: 50
seed: 74831
learning_rate: 1e-5
# prior_preservation: 1.0
train_temporal_conv: True
60 changes: 60 additions & 0 deletions config/tune/swan.yaml
@@ -0,0 +1,60 @@
# CUDA_VISIBLE_DEVICES=2 python train_tune_a_video.py --config config/tune/swan.yaml
# There is no obvious difference between v1-4 and v1-5; we simply chose to tune v1-5 at the beginning of this project.
pretrained_model_path: "./ckpt/stable-diffusion-v1-5"

train_dataset:
  path: "data/shape/swan"
  prompt: "a black swan with a red beak swimming in a river near a wall and bushes,"
  n_sample_frame: 8
  # n_sample_frame: 22
  class_data_root: "data/negative_reg/bird"
  class_data_prompt: "a photo of a bird"

  sampling_rate: 1
  stride: 80
  offset:
    left: 0
    right: 0
    top: 0
    bottom: 0

validation_sample_logger_config:
  use_train_latents: True
  use_inversion_attention: True
  guidance_scale: 7.5
  prompts: [
    # source prompt
    a black swan with a red beak swimming in a river near a wall and bushes,

    # foreground color and species
    a white duck with a yellow beak swimming in a river near a wall and bushes,
    a pink flamingo with a red beak swimming in a river near a wall and bushes,
  ]
  clip_length: "${..train_dataset.n_sample_frame}"
  sample_seeds: [12734]
  val_all_frames: False
  num_inference_steps: 50 # 15 minutes
  strength: 0.99

trainer_pipeline_config:
  target: video_diffusion.trainer.ddpm_trainer.DDPMTrainer

test_pipeline_config:
  target: video_diffusion.pipelines.DDIMSpatioTemporalStableDiffusionPipeline.DDIMSpatioTemporalStableDiffusionPipeline

model_config:
  lora: 160
  # temporal_downsample_time: 4
  # SparseCausalAttention_index: [-1, 1, 'first', 'last']

enable_xformers: True
mixed_precision: 'fp16'
gradient_checkpointing: True

train_steps: 1000
validation_steps: 50 # 10 minutes
checkpointing_steps: 50
seed: 74831
learning_rate: 1e-5
# prior_preservation: 1.0
train_temporal_conv: True
12 changes: 1 addition & 11 deletions train_tune_a_video.py
@@ -1,7 +1,6 @@
import os,copy
import inspect
from typing import Optional, List, Dict, Union
import PIL
from typing import Optional, Dict
import click
from omegaconf import OmegaConf

@@ -16,7 +15,6 @@
AutoencoderKL,
DDPMScheduler,
DDIMScheduler,
UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
from diffusers.utils.import_utils import is_xformers_available
@@ -77,12 +75,8 @@ def train(
train_temporal_conv: bool = False,
checkpointing_steps: int = 1000,
model_config: dict={},
# use_train_latents: bool=False,
# kwr
# **kwargs
):
args = get_function_args()
# args.update(kwargs)
train_dataset_config = copy.deepcopy(train_dataset)
time_string = get_time_string()
if logdir is None:
@@ -142,7 +136,6 @@ def train(


if is_xformers_available() and enable_xformers:
# if False: # Disable xformers for null inversion
try:
pipeline.enable_xformers_memory_efficient_attention()
print('enable xformers in the training and testing')
@@ -211,7 +204,6 @@ def train(
weight_decay=adam_weight_decay,
eps=adam_epsilon,
)
# End of config trainable parameters in Unet and optimizer


prompt_ids = tokenizer(
@@ -336,7 +328,6 @@ def make_data_yielder(dataloader):


assert(train_dataset.overfit_length == 1), "Only support overfiting on a single video"
# batch = next(train_data_yielder)


while step < train_steps:
@@ -387,7 +378,6 @@ def make_data_yielder(dataloader):


validation_sample_logger.log_sample_images(
# image=rearrange(train_dataset.get_all()["images"].to(accelerator.device, dtype=weight_dtype), "c f h w -> f c h w"), # torch.Size([8, 3, 512, 512])
image= val_image, # torch.Size([8, 3, 512, 512])
pipeline=pipeline,
device=accelerator.device,
4 changes: 3 additions & 1 deletion video_diffusion/data/dataset.py
@@ -34,7 +34,9 @@ def __init__(
"right": 0,
"top": 0,
"bottom": 0
}
},
**args

):
self.path = path
self.images = self.get_image_list(path)
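
A likely motivation for the `**args` parameter added to the dataset constructor above is to let extra keys from the YAML config (for example the new `class_data_root` / `class_data_prompt` entries) pass through without breaking the signature. Below is a minimal sketch with a hypothetical class name, not the repository's actual dataset:

```python
# Hypothetical sketch only; class and field names are illustrative.
class VideoFramesDataset:
    def __init__(self, path: str, prompt: str, n_sample_frame: int = 8, **args):
        self.path = path
        self.prompt = prompt
        self.n_sample_frame = n_sample_frame
        # Any config keys this class does not know about are collected here
        # instead of raising a TypeError.
        self.extra_config = args

# The whole train_dataset section of the YAML can then be splatted in directly:
dataset = VideoFramesDataset(
    path="data/shape/teaser_car-turn",
    prompt="a silver jeep driving down a curvy road in the countryside,",
    n_sample_frame=8,
    class_data_root="data/negative_reg/car",   # absorbed by **args
    class_data_prompt="a photo of a car",      # absorbed by **args
)
```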
