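"""StoryMaker SDXL inference demos.

Generates single-portrait, two-portrait, and cloth-swap images from reference
photos, segmentation masks, and InsightFace-detected face embeddings.
"""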
import cv2
import numpy as np
import torch
from PIL import Image

# Register HEIF and AVIF decoders so PIL can open .heic/.avif reference images.
import pillow_heif
pillow_heif.register_heif_opener()
pillow_heif.register_avif_opener()

from diffusers import EulerAncestralDiscreteScheduler, UniPCMultistepScheduler
from insightface.app import FaceAnalysis
from pipeline_sdxl_storymaker import StableDiffusionXLStoryMakerPipeline

def resize_img(input_image, max_side=1280, min_side=960, size=None,
               pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
    """Resize so the short side reaches min_side and the long side stays within
    max_side, snapping both dimensions down to a multiple of base_pixel_number."""
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        # Scale the short side up to min_side, then cap the long side at max_side.
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
        # SDXL expects dimensions divisible by the UNet/VAE downsampling factor.
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        # Center the resized image on a white max_side x max_side canvas.
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        res[offset_y:offset_y + h_resize_new, offset_x:offset_x + w_resize_new] = np.array(input_image)
        input_image = Image.fromarray(res)
    return input_image
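
# A minimal usage sketch (the demos below leave this step commented out):
#   image = resize_img(Image.open("examples/ldh.png").convert('RGB'))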

# Load the face encoder (InsightFace 'buffalo_l' detection + recognition models).
app = FaceAnalysis(name='buffalo_l', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

# Paths to models
face_adapter = 'checkpoints/mask.bin'
image_encoder_path = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K'  # from https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
base_model_path = 'huaquan/YamerMIX_v11'  # from https://huggingface.co/huaquan/YamerMIX_v11

pipe = StableDiffusionXLStoryMakerPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
)
pipe.cuda()
pipe.load_storymaker_adapter(image_encoder_path, face_adapter, scale=0.8, lora_scale=0.8)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
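# EulerAncestralDiscreteScheduler (imported above) is a drop-in alternative if
# ancestral sampling is preferred:
#   pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)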

def demo():
    """Single-portrait generation from one reference image and mask."""
    prompt = "a person is taking a selfie, the person is wearing a red hat, and a volcano is in the distance"
    n_prompt = "bad quality, NSFW, low quality, ugly, disfigured, deformed"

    image = Image.open("examples/ldh.png").convert('RGB')
    mask_image = Image.open("examples/ldh_mask.png").convert('RGB')
    # image = resize_img(image)

    face_info = app.get(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
    face_info = sorted(face_info, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]  # keep only the largest detected face

    generator = torch.Generator(device='cuda').manual_seed(666)
    for i in range(4):
        output = pipe(
            image=image, mask_image=mask_image, face_info=face_info,
            prompt=prompt,
            negative_prompt=n_prompt,
            ip_adapter_scale=0.8, lora_scale=0.8,
            num_inference_steps=25,
            guidance_scale=7.5,
            height=1280, width=960,
            generator=generator,
        ).images[0]
        output.save(f'examples/results/ldh666_{i}.jpg')

def demo_two():
    """Two-portrait generation from two reference images and masks."""
    prompt = "A man and a woman are taking a selfie, and a volcano is in the distance"
    n_prompt = "bad quality, NSFW, low quality, ugly, disfigured, deformed"

    image = Image.open("examples/ldh.png").convert('RGB')
    mask_image = Image.open("examples/ldh_mask.png").convert('RGB')
    image_2 = Image.open("examples/tsy.png").convert('RGB')
    mask_image_2 = Image.open("examples/tsy_mask.png").convert('RGB')

    face_info = app.get(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
    face_info = sorted(face_info, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]  # keep only the largest detected face
    face_info_2 = app.get(cv2.cvtColor(np.array(image_2), cv2.COLOR_RGB2BGR))
    face_info_2 = sorted(face_info_2, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]  # keep only the largest detected face

    generator = torch.Generator(device='cuda').manual_seed(666)
    for i in range(4):
        output = pipe(
            image=image, mask_image=mask_image, face_info=face_info,  # first person
            image_2=image_2, mask_image_2=mask_image_2, face_info_2=face_info_2,  # second person
            prompt=prompt,
            negative_prompt=n_prompt,
            ip_adapter_scale=0.8, lora_scale=0.8,
            num_inference_steps=25,
            guidance_scale=7.5,
            height=1280, width=960,
            generator=generator,
        ).images[0]
        output.save(f'examples/results/ldh_tsy666_{i}.jpg')

def demo_swapcloth():
    """Single-portrait generation with the outfit taken from a cloth image."""
    prompt = "a person is taking a selfie, and a volcano is in the distance"
    n_prompt = "bad quality, NSFW, low quality, ugly, disfigured, deformed"

    image = Image.open("examples/ldh.png").convert('RGB')
    mask_image = Image.open("examples/ldh_mask.png").convert('RGB')
    cloth = Image.open("examples/cloth2.png").convert('RGB')

    face_info = app.get(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
    face_info = sorted(face_info, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]  # keep only the largest detected face

    generator = torch.Generator(device='cuda').manual_seed(666)
    for i in range(4):
        output = pipe(
            image=image, mask_image=mask_image, face_info=face_info, cloth=cloth,
            prompt=prompt,
            negative_prompt=n_prompt,
            ip_adapter_scale=0.8, lora_scale=0.8,
            num_inference_steps=25,
            guidance_scale=7.5,
            height=1280, width=960,
            generator=generator,
        ).images[0]
        output.save(f'examples/results/ldh_cloth_{i}.jpg')

if __name__ == "__main__":
    # single-portrait generation
    demo()
    # two-portrait generation
    # demo_two()
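    # cloth-swap generation
    # demo_swapcloth()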