
Update DirectML (#667)
* torch 2

* update patches
NullSenseStudio authored Jul 2, 2023
1 parent 52960e9 commit cca6960
Showing 9 changed files with 30 additions and 47 deletions.
generator_process/actions/control_net.py (4 changes: 2 additions & 2 deletions)

@@ -465,7 +465,7 @@ def __call__(
 batch_size = len(prompt) if isinstance(prompt, list) else 1
 generator = []
 for _ in range(batch_size):
-gen = torch.Generator(device="cpu" if device in ("mps", "privateuseone") else device) # MPS and DML do not support the `Generator` API
+gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API
 generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed))
 if batch_size == 1:
 # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909
@@ -510,7 +510,7 @@ def __call__(
 _configure_model_padding(pipe.vae, seamless_axes)

 # Inference
-with (torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext()), \
+with (torch.inference_mode() if device not in ('mps', "dml") else nullcontext()), \
 (torch.autocast(device) if optimizations.can_use("amp", device) else nullcontext()):
 yield from pipe(
 prompt=prompt,
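
Every pipeline touched by this commit converges on the same seeding pattern noted in the comment above: MPS and DirectML cannot host a torch.Generator, so seeds are drawn from a CPU generator instead. A minimal standalone sketch of that pattern (the helper name and signature are illustrative, not from the repository):

import random

import numpy as np
import torch

def make_generators(device: str, batch_size: int, seed: int | None = None) -> list[torch.Generator]:
    # "dml" is the DirectML backend name after torch.utils.rename_privateuse1_backend("dml");
    # MPS and DML do not support the Generator API, so the generator stays on the CPU.
    generators = []
    for _ in range(batch_size):
        gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device)
        gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)
        generators.append(gen)
    return generators
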
generator_process/actions/depth_to_image.py (4 changes: 2 additions & 2 deletions)

@@ -340,7 +340,7 @@ def __call__(
 batch_size = len(prompt) if isinstance(prompt, list) else 1
 generator = []
 for _ in range(batch_size):
-gen = torch.Generator(device="cpu" if device in ("mps", "privateuseone") else device) # MPS and DML do not support the `Generator` API
+gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API
 generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed))
 if batch_size == 1:
 # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909
@@ -371,7 +371,7 @@ def __call__(
 _configure_model_padding(pipe.vae, seamless_axes)

 # Inference
-with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext():
+with torch.inference_mode() if device not in ('mps', "dml") else nullcontext():
 yield from pipe(
 prompt=prompt,
 depth_image=depth_image,
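
The inference block above shows the other recurring pattern the rename touches: torch.inference_mode() is skipped on MPS and DirectML and replaced with a no-op context. A hedged sketch of that selection as a small helper (the function name is illustrative, not part of the commit):

from contextlib import nullcontext

import torch

def inference_context(device: str):
    # inference_mode() disables autograd and version tracking for extra speed,
    # but the pipelines fall back to nullcontext() on "mps" and "dml".
    return torch.inference_mode() if device not in ("mps", "dml") else nullcontext()

# usage:
# with inference_context(device):
#     ...run the pipeline...
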
generator_process/actions/image_to_image.py (8 changes: 3 additions & 5 deletions)

@@ -135,8 +135,6 @@ def __call__(
 # TODO: Add UI to enable this
 # 10. Run safety checker
 # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
-
-image = self.image_processor.postprocess(image, output_type=output_type)

 # Offload last model to CPU
 if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
@@ -145,7 +143,7 @@ def __call__(
 # NOTE: Modified to yield the decoded image as a numpy array.
 yield ImageGenerationResult(
 [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255.
-for i, image in enumerate(image)],
+for i, image in enumerate(self.numpy_to_pil(image))],
 [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()],
 num_inference_steps,
 True
@@ -166,7 +164,7 @@ def __call__(
 batch_size = len(prompt) if isinstance(prompt, list) else 1
 generator = []
 for _ in range(batch_size):
-gen = torch.Generator(device="cpu" if device in ("mps", "privateuseone") else device) # MPS and DML do not support the `Generator` API
+gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API
 generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed))
 if batch_size == 1:
 # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909
@@ -190,7 +188,7 @@ def __call__(
 _configure_model_padding(pipe.vae, seamless_axes)

 # Inference
-with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext():
+with torch.inference_mode() if device not in ('mps', "dml") else nullcontext():
 yield from pipe(
 prompt=prompt,
 image=[init_image] * batch_size,
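
In the second hunk the pipeline now converts its decoded numpy output with numpy_to_pil before flipping it and converting to RGBA floats. A rough standalone sketch of that conversion, assuming `images` is the decoded output as a float array in [0, 1] with shape (N, H, W, 3) (the helper name is illustrative, not repository code):

import numpy as np
from PIL import Image, ImageOps

def to_rgba_float(images: np.ndarray) -> list[np.ndarray]:
    results = []
    for arr in (images * 255).round().astype("uint8"):
        pil = Image.fromarray(arr)  # roughly what StableDiffusionPipeline.numpy_to_pil does per image
        # Flip vertically (Blender stores pixels bottom-up), add an alpha channel,
        # and scale back to float32 in [0, 1].
        results.append(np.asarray(ImageOps.flip(pil).convert("RGBA"), dtype=np.float32) / 255.0)
    return results
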
generator_process/actions/inpaint.py (4 changes: 2 additions & 2 deletions)

@@ -207,7 +207,7 @@ def __call__(
 batch_size = len(prompt) if isinstance(prompt, list) else 1
 generator = []
 for _ in range(batch_size):
-gen = torch.Generator(device="cpu" if device in ("mps", "privateuseone") else device) # MPS and DML do not support the `Generator` API
+gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API
 generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed))
 if batch_size == 1:
 # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909
@@ -223,7 +223,7 @@ def __call__(
 _configure_model_padding(pipe.vae, seamless_axes)

 # Inference
-with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext():
+with torch.inference_mode() if device not in ('mps', "dml") else nullcontext():
 match inpaint_mask_src:
 case 'alpha':
 mask_image = ImageOps.invert(init_image.getchannel('A'))
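
The context lines at the end of this hunk show the 'alpha' mask source: the init image's alpha channel is inverted so transparent areas become the region to inpaint. A tiny hedged illustration with a placeholder image (not repository code):

from PIL import Image, ImageOps

# Placeholder RGBA image: fully transparent, i.e. everything should be inpainted.
init_image = Image.new("RGBA", (64, 64), (255, 255, 255, 0))
# Alpha 0 (transparent) inverts to 255 (white), which marks the pixels to inpaint.
mask_image = ImageOps.invert(init_image.getchannel("A"))
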
generator_process/actions/prompt_to_image.py (20 changes: 10 additions & 10 deletions)

@@ -151,10 +151,10 @@ class Optimizations:
 cudnn_benchmark: Annotated[bool, "cuda"] = False
 tf32: Annotated[bool, "cuda"] = False
 amp: Annotated[bool, "cuda"] = False
-half_precision: Annotated[bool, {"cuda", "privateuseone"}] = True
-cpu_offload: Annotated[str, {"cuda", "privateuseone"}] = "off"
+half_precision: Annotated[bool, {"cuda", "dml"}] = True
+cpu_offload: Annotated[str, {"cuda", "dml"}] = "off"
 channels_last_memory_format: bool = False
-sdp_attention: Annotated[bool, {"cpu", "cuda", "mps"}] = True
+sdp_attention: bool = True
 batch_size: int = 1
 vae_slicing: bool = True
 vae_tiling: str = "off"
@@ -169,7 +169,7 @@ def infer_device() -> str:
 if sys.platform == "darwin":
 return "mps"
 elif Pipeline.directml_available():
-return "privateuseone"
+return "dml"
 else:
 return "cuda"

@@ -277,7 +277,7 @@ def apply(self, pipeline, device):
 except: pass

 from .. import directml_patches
-if device == "privateuseone":
+if device == "dml":
 directml_patches.enable(pipeline)
 else:
 directml_patches.disable(pipeline)
@@ -380,8 +380,8 @@ def choose_device(self) -> str:
 if Pipeline.directml_available():
 import torch_directml
 if torch_directml.is_available():
-# can be named better when torch.utils.rename_privateuse1_backend() is released
-return "privateuseone"
+torch.utils.rename_privateuse1_backend("dml")
+return "dml"
 return "cpu"

 def approximate_decoded_latents(latents):
@@ -600,7 +600,7 @@ def __call__(
 batch_size = len(prompt) if isinstance(prompt, list) else 1
 generator = []
 for _ in range(batch_size):
-gen = torch.Generator(device="cpu" if device in ("mps", "privateuseone") else device) # MPS and DML do not support the `Generator` API
+gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API
 generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed))
 if batch_size == 1:
 # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909
@@ -611,7 +611,7 @@ def __call__(
 _configure_model_padding(pipe.vae, seamless_axes)

 # Inference
-with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext():
+with torch.inference_mode() if device not in ('mps', "dml") else nullcontext():
 yield from pipe(
 prompt=prompt,
 height=height,
@@ -672,7 +672,7 @@ def _conv_forward_asymmetric(self, input, weight, bias):
 """
 Patch for Conv2d._conv_forward that supports asymmetric padding
 """
-if input.device.type == "privateuseone":
+if input.device.type == "dml":
 # DML pad() will wrongly fill the tensor in constant mode with the supplied value
 # (default 0) when padding on both ends of a dimension, can't split to two calls.
 working = nn.functional.pad(input, self._reversed_padding_repeated_twice, mode='circular')
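
The choose_device hunk is the core of the rename: with torch 2, torch.utils.rename_privateuse1_backend("dml") renames PyTorch's private-use backend so torch-directml tensors report device.type == "dml", and the plain string "dml" works everywhere the pipelines previously spelled out "privateuseone". A hedged, self-contained sketch of that device selection (the fallback order here is illustrative and not identical to the repository's):

import sys

import torch

def choose_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if sys.platform == "darwin" and torch.backends.mps.is_available():
        return "mps"
    try:
        import torch_directml
        if torch_directml.is_available():
            # Rename the "privateuseone" dispatch key so "dml" is a valid device string.
            torch.utils.rename_privateuse1_backend("dml")
            return "dml"
    except ImportError:
        pass
    return "cpu"
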
generator_process/actions/upscale.py (2 changes: 1 addition & 1 deletion)

@@ -55,7 +55,7 @@ def upscale(
 pipe = pipe.to(device)
 pipe = optimizations.apply(pipe, device)

-generator = torch.Generator(device="cpu" if device in ("mps", "privateuseone") else device) # MPS and DML do not support the `Generator` API
+generator = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API
 if seed is None:
 seed = random.randrange(0, np.iinfo(np.uint32).max)

generator_process/directml_patches.py (31 changes: 8 additions & 23 deletions)

@@ -6,18 +6,8 @@
 active_dml_patches: list | None = None


-def tensor_ensure_device(self, other, *, pre_patch):
-"""Fix for operations where one tensor is DML and the other is CPU."""
-if isinstance(other, Tensor) and self.device != other.device:
-if self.device.type != "cpu":
-other = other.to(self.device)
-else:
-self = self.to(other.device)
-return pre_patch(self, other)
-
-
 def baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None, pre_patch):
-if input.device.type == "privateuseone" and beta == 0:
+if input.device.type == "dml" and beta == 0:
 if out is not None:
 torch.bmm(batch1, batch2, out=out)
 out *= alpha
@@ -27,7 +17,7 @@ def baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None, pre_patch):


 def pad(input, pad, mode="constant", value=None, *, pre_patch):
-if input.device.type == "privateuseone" and mode == "constant":
+if input.device.type == "dml" and mode == "constant":
 pad_dims = torch.tensor(pad, dtype=torch.int32).view(-1, 2).flip(0)
 both_ends = False
 for pre, post in pad_dims:
@@ -49,7 +39,7 @@ def pad(input, pad, mode="constant", value=None, *, pre_patch):


 def getitem(self, key, *, pre_patch):
-if isinstance(key, Tensor) and "privateuseone" in [self.device.type, key.device.type] and key.numel() == 1:
+if isinstance(key, Tensor) and "dml" in [self.device.type, key.device.type] and key.numel() == 1:
 return pre_patch(self, int(key))
 return pre_patch(self, key)

@@ -72,15 +62,8 @@ def dml_patch_method(object, name, patched):

 # Not all places where the patches have an effect are necessarily listed.

-# PNDMScheduler.step()
-dml_patch_method(Tensor, "__mul__", tensor_ensure_device)
-# PNDMScheduler.step()
-dml_patch_method(Tensor, "__sub__", tensor_ensure_device)
-# DDIMScheduler.step() last timestep in image_to_image
-dml_patch_method(Tensor, "__truediv__", tensor_ensure_device)
-
-# CrossAttention.get_attention_scores()
-# AttentionBlock.forward()
+# diffusers.models.attention_processor.Attention.get_attention_scores()
+# diffusers.models.attention.AttentionBlock.forward()
 # Diffusers implementation gives torch.empty() tensors with beta=0 to baddbmm(), which may contain NaNs.
 # DML implementation doesn't properly ignore input argument with beta=0 and causes NaN propagation.
 dml_patch(torch, "baddbmm", baddbmm)
@@ -105,7 +88,9 @@ def nan_check(key, x):
 nan_check(i, v)
 for k, v in kwargs.items():
 nan_check(k, v)
-return original(*args, **kwargs)
+r = original(*args, **kwargs)
+nan_check("return", r)
+return r
 module.forward = func.__get__(module)

 # only enable when testing
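
The surviving patches all follow the same shape: keep a reference to the original callable, special-case DML tensors, and delegate back otherwise. A hedged sketch of that pattern around the baddbmm workaround described in the comments above (helper names are illustrative, not the module's actual dml_patch/dml_patch_method):

import functools

import torch

def patch_function(namespace, name, patched):
    # Capture the original so the replacement can delegate to it via `pre_patch`.
    original = getattr(namespace, name)
    setattr(namespace, name, functools.partial(patched, pre_patch=original))

def baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None, pre_patch):
    # With beta == 0 the `input` buffer should be ignored, but the DML kernel reads it,
    # so NaNs from torch.empty() would propagate; compute alpha * (batch1 @ batch2) instead.
    if input.device.type == "dml" and beta == 0:
        if out is not None:
            torch.bmm(batch1, batch2, out=out)
            out *= alpha
            return out
        return torch.bmm(batch1, batch2) * alpha
    return pre_patch(input, batch1, batch2, beta=beta, alpha=alpha, out=out)

# usage (illustrative): patch_function(torch, "baddbmm", baddbmm)
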
generator_process/models/upscale_tiler.py (2 changes: 1 addition & 1 deletion)

@@ -236,7 +236,7 @@ def _conv_forward_asymmetric(self, input, weight, bias):
 """
 Patch for Conv2d._conv_forward that supports asymmetric padding
 """
-if input.device.type == "privateuseone":
+if input.device.type == "dml":
 # DML pad() will wrongly fill the tensor in constant mode with the supplied value
 # (default 0) when padding on both ends of a dimension, can't split to two calls.
 working = nn.functional.pad(input, self._reversed_padding_repeated_twice, mode='circular')
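
For context on the line above: _reversed_padding_repeated_twice is Conv2d's per-side padding in F.pad order, and circular mode wraps the tensor's edges, which is what makes seamless tiling possible. A small standalone illustration (not repository code):

import torch
import torch.nn.functional as F

x = torch.arange(9, dtype=torch.float32).reshape(1, 1, 3, 3)
# Pad one pixel on each side of the last two dimensions, wrapping around the edges.
padded = F.pad(x, (1, 1, 1, 1), mode="circular")
print(padded.shape)  # torch.Size([1, 1, 5, 5])
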
requirements/win-dml.txt (2 changes: 1 addition & 1 deletion)

@@ -3,8 +3,8 @@ transformers
 accelerate
 huggingface_hub

-torch>=1.13
 torch-directml
+torch>=2.0

 # Original SD checkpoint conversion
 pytorch-lightning
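
With the requirement bumped to torch 2, a quick way to confirm the DirectML backend is usable (a hedged check, assuming torch-directml was installed from this requirements file):

import torch
import torch_directml

torch.utils.rename_privateuse1_backend("dml")
device = torch_directml.device()  # default DirectML adapter
x = torch.ones(2, 2, device=device) + 1
# After the rename, x.device.type should report "dml".
print(x.device, x.cpu())
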
