threestudio-project · voletiv · Dec 13, 2023 · Dec 11, 2023 · Dec 11, 2023 · Dec 11, 2023
diff --git a/README.md b/README.md
@@ -108,6 +108,8 @@ pip install ninja
 pip install -r requirements.txt
 ```
 
+- (Optional) `tiny-cuda-nn` installation might require downgrading pip to 23.0.1
+
 - (Optional, Recommended) The best-performing models in threestudio use the newly-released T2I model [DeepFloyd IF](https://github.com/deep-floyd/IF), which currently requires signing a license agreement. If you would like to use these models, you need to [accept the license on the model card of DeepFloyd IF](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0), and login into the Hugging Face hub in the terminal by `huggingface-cli login`.
 
 - For contributors, see [here](https://github.com/threestudio-project/threestudio#contributing-to-threestudio).
@@ -517,6 +519,31 @@ python launch.py --config configs/magic123-refine-sd.yaml --train --gpu 0 data.i
 
 - If the image contains non-front-facing objects, specifying the approximate elevation and azimuth angle by setting `data.default_elevation_deg` and `data.default_azimuth_deg` can be helpful. In threestudio, top is elevation +90 and bottom is elevation -90; left is azimuth -90 and right is azimuth +90.
 
+
+### Stable Zero123
+
+**Installation**
+
+Download the pretrained Stable Zero123 checkpoint `stable_zero123.ckpt` from https://huggingface.co/stabilityai/stable-zero123 and place it in `load/zero123/`.
+
+**Results obtained by threestudio (Stable Zero123 vs Zero123-XL)**
+![Final_video_v01](https://github.com/threestudio-project/threestudio/assets/22424247/bf2d2213-5027-489c-a6ba-1c56c14ee8b7)
+
+**Example running commands**
+
+1. Take an image of your choice, or generate one from text using your favourite AI image generator, such as SDXL Turbo (https://clipdrop.co/stable-diffusion-turbo), e.g. with the prompt "A simple 3D render of a friendly dog".
+2. Remove its background using Clipdrop (https://clipdrop.co/remove-background)
+3. Save to `load/images/`, preferably with `_rgba.png` as the suffix
+4. Run Zero-1-to-3 with the Stable Zero123 ckpt:
+```sh
+python launch.py --config configs/stable-zero123.yaml --train --gpu 0 data.image_path=./load/images/hamburger_rgba.png
+```
+
+**IMPORTANT NOTE: This is an experimental implementation and we're constantly improving the quality.**
+
+**IMPORTANT NOTE: This implementation extends the Zero-1-to-3 implementation below, and is heavily inspired by the Zero-1-to-3 implementation in [stable-dreamfusion](https://github.com/ashawkey/stable-dreamfusion)! `extern/ldm_zero123` is borrowed from `stable-dreamfusion/ldm`.**
+
+
 ### Zero-1-to-3 [![arXiv](https://img.shields.io/badge/arXiv-2303.11328-b31b1b.svg?style=flat-square)](https://arxiv.org/abs/2303.11328)
 
 **Installation**

diff --git a/configs/zero123_64.yaml → configs/stable-zero123.yaml b/configs/zero123_64.yaml → configs/stable-zero123.yaml
@@ -1,24 +1,25 @@
-name: "zero123"
-tag: "${data.random_camera.height}_${rmspace:${basename:${data.image_path}},_}_prog${data.random_camera.progressive_until}"
+name: "zero123-sai"
+tag: "${data.random_camera.height}_${rmspace:${basename:${data.image_path}},_}"
 exp_root_dir: "outputs"
 seed: 0
 
 data_type: "single-image-datamodule"
 data: # threestudio/data/image.py -> SingleImageDataModuleConfig
   image_path: ./load/images/hamburger_rgba.png
-  height: 128
-  width: 128
-  default_elevation_deg: 0.0
+  height: [128, 256, 512]
+  width: [128, 256, 512]
+  resolution_milestones: [200, 300]
+  default_elevation_deg: 5.0
   default_azimuth_deg: 0.0
   default_camera_distance: 3.8
   default_fovy_deg: 20.0
   requires_depth: ${cmaxgt0orcmaxgt0:${system.loss.lambda_depth},${system.loss.lambda_depth_rel}}
   requires_normal: ${cmaxgt0:${system.loss.lambda_normal}}
   random_camera: # threestudio/data/uncond.py -> RandomCameraDataModuleConfig
-    height: 64
-    width: 64
-    batch_size: 12
-    resolution_milestones: []
+    height: [64, 128, 256]
+    width: [64, 128, 256]
+    batch_size: [12, 8, 4]
+    resolution_milestones: [200, 300]
     eval_height: 512
     eval_width: 512
     eval_batch_size: 1
@@ -47,13 +48,6 @@ system:
     radius: 2.0
     normal_type: "analytic"
 
-    # the density initialization proposed in the DreamFusion paper
-    # does not work very well
-    # density_bias: "blob_dreamfusion"
-    # density_activation: exp
-    # density_blob_scale: 5.
-    # density_blob_std: 0.2
-
     # use Magic3D density initialization instead
     density_bias: "blob_magic3d"
     density_activation: softplus
@@ -88,28 +82,26 @@ system:
   renderer:
     radius: ${system.geometry.radius}
     num_samples_per_ray: 512
-    return_comp_normal: ${gt0:${system.loss.lambda_normal_smooth}}
-    return_normal_perturb: ${gt0:${system.loss.lambda_3d_normal_smooth}}
+    return_comp_normal: ${cmaxgt0:${system.loss.lambda_normal_smooth}}
+    return_normal_perturb: ${cmaxgt0:${system.loss.lambda_3d_normal_smooth}}
 
   prompt_processor_type: "dummy-prompt-processor" # Zero123 doesn't use prompts
   prompt_processor:
     pretrained_model_name_or_path: ""
     prompt: ""
 
-  guidance_type: "zero123-guidance"
+  guidance_type: "stable-zero123-guidance"
   guidance:
-    pretrained_model_name_or_path: "./load/zero123/zero123-xl.ckpt"
     pretrained_config: "./load/zero123/sd-objaverse-finetune-c_concat-256.yaml"
+    pretrained_model_name_or_path: "./load/zero123/stable_zero123.ckpt"
     vram_O: ${not:${gt0:${system.freq.guidance_eval}}}
     cond_image_path: ${data.image_path}
     cond_elevation_deg: ${data.default_elevation_deg}
     cond_azimuth_deg: ${data.default_azimuth_deg}
     cond_camera_distance: ${data.default_camera_distance}
     guidance_scale: 3.0
-    #min_step_percent: 0.02
-    min_step_percent: [0, 0.4, 0.2, 200]  # (start_iter, start_val, end_val, end_iter)
-    #max_step_percent: 0.98
-    max_step_percent: [0, 0.85, 0.5, 200]
+    min_step_percent: [50, 0.7, 0.3, 200]  # (start_iter, start_val, end_val, end_iter)
+    max_step_percent: [50, 0.98, 0.8, 200]
 
   freq:
     ref_only_steps: 0
@@ -123,16 +115,16 @@ system:
 
   loss:
     lambda_sds: 0.1
-    lambda_rgb: 500.
+    lambda_rgb: [100, 500., 1000., 400]
     lambda_mask: 50.
     lambda_depth: 0. # 0.05
     lambda_depth_rel: 0. # [0, 0, 0.05, 100]
     lambda_normal: 0. # [0, 0, 0.05, 100]
-    lambda_normal_smooth: 10.0
-    lambda_3d_normal_smooth: 10.0
+    lambda_normal_smooth: [100, 7.0, 5.0, 150, 10.0, 200]
+    lambda_3d_normal_smooth: [100, 7.0, 5.0, 150, 10.0, 200]
     lambda_orient: 1.0
-    lambda_sparsity: 0.1 # should be tweaked for every model
-    lambda_opaque: 0.1
+    lambda_sparsity: 0.5 # should be tweaked for every model
+    lambda_opaque: 0.5
 
   optimizer:
     name: Adam
@@ -142,14 +134,14 @@ system:
       eps: 1.e-8
 
 trainer:
-  max_steps: 400
+  max_steps: 600
   log_every_n_steps: 1
   num_sanity_val_steps: 0
   val_check_interval: 100
   enable_progress_bar: true
-  precision: 16-mixed
+  precision: 32
 
 checkpoint:
   save_last: true # save at each validation time
   save_top_k: -1
-  every_n_train_steps: ${trainer.max_steps}
+  every_n_train_steps: 100 # ${trainer.max_steps}
diff --git a/configs/zero123.yaml b/configs/zero123.yaml
@@ -1,5 +1,5 @@
 name: "zero123"
-tag: "${data.random_camera.height}_${rmspace:${basename:${data.image_path}},_}_prog${data.random_camera.progressive_until}"
+tag: "${data.random_camera.height}_${rmspace:${basename:${data.image_path}},_}"
 exp_root_dir: "outputs"
 seed: 0
 
@@ -9,7 +9,7 @@ data: # threestudio/data/image.py -> SingleImageDataModuleConfig
   height: [128, 256, 512]
   width: [128, 256, 512]
   resolution_milestones: [200, 300]
-  default_elevation_deg: 0.0
+  default_elevation_deg: 5.0
   default_azimuth_deg: 0.0
   default_camera_distance: 3.8
   default_fovy_deg: 20.0
@@ -111,9 +111,7 @@ system:
     cond_azimuth_deg: ${data.default_azimuth_deg}
     cond_camera_distance: ${data.default_camera_distance}
     guidance_scale: 3.0
-    #min_step_percent: 0.02
     min_step_percent: [0, 0.4, 0.2, 200]  # (start_iter, start_val, end_val, end_iter)
-    #max_step_percent: 0.98
     max_step_percent: [0, 0.85, 0.5, 200]
 
   freq:
@@ -147,7 +145,7 @@ system:
       eps: 1.e-8
 
 trainer:
-  max_steps: 400
+  max_steps: 600
   log_every_n_steps: 1
   num_sanity_val_steps: 0
   val_check_interval: 100

diff --git a/load/images/dog1.png → load/images/dog1_rgba.png b/load/images/dog1.png → load/images/dog1_rgba.png
diff --git a/load/zero123/download.sh b/load/zero123/download.sh
@@ -1 +1,4 @@
-wget https://huggingface.co/cvlab/zero123-weights/resolve/main/105000.ckpt
+# wget https://huggingface.co/cvlab/zero123-weights/resolve/main/105000.ckpt
+# mv 105000.ckpt zero123-original.ckpt
+wget https://zero123.cs.columbia.edu/assets/zero123-xl.ckpt
+# Download stable_zero123.ckpt from https://huggingface.co/stabilityai/stable-zero123
diff --git a/threestudio/models/guidance/__init__.py b/threestudio/models/guidance/__init__.py
@@ -5,6 +5,7 @@
     stable_diffusion_guidance,
     stable_diffusion_unified_guidance,
     stable_diffusion_vsd_guidance,
+    stable_zero123_guidance,
     zero123_guidance,
     zero123_unified_guidance,
 )