diff --git a/README.MD b/README.MD
index 3fa16a6..b8dbe29 100644
--- a/README.MD
+++ b/README.MD
@@ -150,6 +150,7 @@ Please try downgrading the ```protobuf``` dependency package to 3.20.3, or set e
 **If the dependency package error after updating, please double clicking ```repair_dependency.bat``` (for Official ComfyUI Protable) or ```repair_dependency_aki.bat``` (for ComfyUI-aki-v1.x) in the plugin folder to reinstall the dependency packages.
+* [SAM2Ultra](#SAM2Ultra) and [SAM2VideoUltra](#SAM2VideoUltra) nodes add support for the SAM2.1 model, including [kijai](https://github.com/kijai)'s FP16 models. Download the model files from [BaiduNetdisk](https://pan.baidu.com/s/1xaQYBA6ktxvAxm310HXweQ?pwd=auki) or [huggingface.co/Kijai/sam2-safetensors](https://huggingface.co/Kijai/sam2-safetensors/tree/main) and copy them to the ```ComfyUI/models/sam2``` folder.
 * Commit [JoyCaption2Split](#JoyCaption2Split) and [LoadJoyCaption2Model](#LoadJoyCaption2Model) nodes, Sharing the model across multiple JoyCaption2 nodes improves efficiency.
 * [SegmentAnythingUltra](#SegmentAnythingUltra) and [SegmentAnythingUltraV2](#SegmentAnythingUltraV2) add the ```cache_model``` option, Easy to flexibly manage VRAM usage.
@@ -2487,9 +2488,9 @@ Node Options:
 ![image](image/segformer_fashion_example.jpg)
 Using the segformer model to segment clothing with ultra-high edge details. Currently supports segformer b2 clothes, segformer b3 clothes and segformer b3 fashion。
-*Download modelfiles from [huggingface](https://huggingface.co/mattmdjaga/segformer_b2_clothes/tree/main) or [百度网盘](https://pan.baidu.com/s/1OK-HfCNyZWux5iQFANq9Rw?pwd=haxg) to ```ComfyUI/models/segformer_b2_clothes``` folder.
-*Download modelfiles from [huggingface](https://huggingface.co/sayeed99/segformer_b3_clothes/tree/main) or [百度网盘](https://pan.baidu.com/s/18KrCqNqUwmoJlqgAGDTw9g?pwd=ap4z) to ```ComfyUI/models/segformer_b3_clothes``` folder.
-*Download modelfiles from [huggingface](https://huggingface.co/sayeed99/segformer-b3-fashion/tree/main) or [百度网盘](https://pan.baidu.com/s/10vd5PmJLFNWXaRVGW6tSvA?pwd=xzqi) to ```ComfyUI/models/segformer_b3_fashion``` folder.
+*Download model files from [huggingface](https://huggingface.co/mattmdjaga/segformer_b2_clothes/tree/main) or [BaiduNetdisk](https://pan.baidu.com/s/1OK-HfCNyZWux5iQFANq9Rw?pwd=haxg) to the ```ComfyUI/models/segformer_b2_clothes``` folder.
+*Download model files from [huggingface](https://huggingface.co/sayeed99/segformer_b3_clothes/tree/main) or [BaiduNetdisk](https://pan.baidu.com/s/18KrCqNqUwmoJlqgAGDTw9g?pwd=ap4z) to the ```ComfyUI/models/segformer_b3_clothes``` folder.
+*Download model files from [huggingface](https://huggingface.co/sayeed99/segformer-b3-fashion/tree/main) or [BaiduNetdisk](https://pan.baidu.com/s/10vd5PmJLFNWXaRVGW6tSvA?pwd=xzqi) to the ```ComfyUI/models/segformer_b3_fashion``` folder.
 
 Node Options:
 ![image](image/segformer_ultra_v2_node.jpg)
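As a usage note for the SAM2.1 checkpoints above, the snippet below is a minimal sketch of pre-downloading one checkpoint with `huggingface_hub`, using the same `snapshot_download` call that the SAM2 nodes fall back to in `py/sam_2_ultra.py`. The helper name `fetch_sam2_checkpoint` and the default paths are illustrative assumptions, not part of the plugin.

```python
# Hypothetical helper (not part of the plugin): pre-download a SAM2.1 checkpoint
# into ComfyUI/models/sam2, mirroring the auto-download fallback added in this patch.
import os
from huggingface_hub import snapshot_download

def fetch_sam2_checkpoint(sam2_model: str = "sam2.1_hiera_large-fp16.safetensors",
                          models_dir: str = os.path.join("ComfyUI", "models")) -> str:
    sam2_path = os.path.join(models_dir, "sam2")
    os.makedirs(sam2_path, exist_ok=True)
    model_path = os.path.join(sam2_path, sam2_model)
    if not os.path.exists(model_path):
        # Pull only the files matching the requested checkpoint name from Kijai's repo.
        snapshot_download(repo_id="Kijai/sam2-safetensors",
                          allow_patterns=[f"*{sam2_model}*"],
                          local_dir=sam2_path,
                          local_dir_use_symlinks=False)
    return model_path

if __name__ == "__main__":
    print(fetch_sam2_checkpoint())
```

If the file is already present in ```ComfyUI/models/sam2``` (for example after a manual BaiduNetdisk download), the nodes skip the download entirely.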
diff --git a/README_CN.MD b/README_CN.MD
index 847a65a..a54c8dd 100644
--- a/README_CN.MD
+++ b/README_CN.MD
@@ -127,6 +127,7 @@ If this call came from a _pb2.py file, your generated code is out of date and mu
 ## 更新说明
 **如果本插件更新后出现依赖包错误,请双击运行插件目录下的```install_requirements.bat```(官方便携包),或 ```install_requirements_aki.bat```(秋叶整合包) 重新安装依赖包。
+* [SAM2Ultra](#SAM2Ultra) 及 [SAM2VideoUltra](#SAM2VideoUltra) 节点增加支持SAM2.1模型,包括[kijai](https://github.com/kijai)量化版fp16模型。请从[百度网盘](https://pan.baidu.com/s/1xaQYBA6ktxvAxm310HXweQ?pwd=auki) 或者 [huggingface.co/Kijai/sam2-safetensors](https://huggingface.co/Kijai/sam2-safetensors/tree/main)下载模型文件并复制到```ComfyUI/models/sam2```文件夹。
 * 添加 [JoyCaption2Split](#JoyCaption2Split) 和 [LoadJoyCaption2Model](#LoadJoyCaption2Model) 节点,在多个JoyCaption2节点时共用模型提高效率。
 * [SegmentAnythingUltra](#SegmentAnythingUltra) 和 [SegmentAnythingUltraV2](#SegmentAnythingUltraV2) 增加 ```cache_model``` 参数,便于灵活管理显存。
 * 鉴于[LlamaVision](#LlamaVision)节点对 ```transformers``` 的要求版本较高而影响某些旧版第三方插件的加载,LayerStyle 插件已将默认要求降低到4.43.2, 如有运行LlamaVision的需求请自行升级至4.45.0以上。
diff --git a/py/sam_2_ultra.py b/py/sam_2_ultra.py
index b355761..b014f99 100644
--- a/py/sam_2_ultra.py
+++ b/py/sam_2_ultra.py
@@ -4,6 +4,7 @@ import comfy.model_management as mm
 from comfy.utils import ProgressBar
 from comfy.utils import load_torch_file
+from contextlib import nullcontext
 from .imagefunc import *
 
 def bboxes2coordinates(bboxes:list) -> list:
@@ -11,7 +12,9 @@ def bboxes2coordinates(bboxes:list) -> list:
     for bbox in bboxes:
         coordinates.append(((bbox[0]+bbox[2]) // 2, (bbox[1]+bbox[3]) // 2))
     return coordinates
+
 def load_model(model_path, model_cfg_path, segmentor, dtype, device):
+    # import yaml
     from .sam2.modeling.sam2_base import SAM2Base
     from .sam2.modeling.backbones.image_encoder import ImageEncoder
     from .sam2.modeling.backbones.hieradet import Hiera
@@ -20,9 +23,11 @@ def load_model(model_path, model_cfg_path, segmentor, dtype, device):
     from .sam2.modeling.memory_attention import MemoryAttention, MemoryAttentionLayer
     from .sam2.modeling.sam.transformer import RoPEAttention
     from .sam2.modeling.memory_encoder import MemoryEncoder, MaskDownSampler, Fuser, CXBlock
+
     from .sam2.sam2_image_predictor import SAM2ImagePredictor
     from .sam2.sam2_video_predictor import SAM2VideoPredictor
     from .sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
+    # from comfy.utils import load_torch_file
 
     # Load the YAML configuration
     with open(model_cfg_path, 'r') as file:
@@ -51,14 +56,9 @@ def load_model(model_path, model_cfg_path, segmentor, dtype, device):
         fpn_interp_model=neck_config['fpn_interp_model']
     )
 
-    trunk = Hiera(
-        embed_dim=trunk_config['embed_dim'],
-        num_heads=trunk_config['num_heads'],
-        stages=trunk_config['stages'],
-        global_att_blocks=trunk_config['global_att_blocks'],
-        window_pos_embed_bkg_spatial_size=trunk_config['window_pos_embed_bkg_spatial_size']
-
-    )
+    keys_to_include = ['embed_dim', 'num_heads', 'global_att_blocks', 'window_pos_embed_bkg_spatial_size', 'stages']
+    trunk_kwargs = {key: trunk_config[key] for key in keys_to_include if key in trunk_config}
+    trunk = Hiera(**trunk_kwargs)
 
     image_encoder = ImageEncoder(
         scalp=model_config['image_encoder']['scalp'],
@@ -179,6 +179,9 @@ def initialize_model(model_class, model_config, segmentor, image_encoder, memory
         multimask_min_pt_num=model_config['multimask_min_pt_num'],
         multimask_max_pt_num=model_config['multimask_max_pt_num'],
         use_mlp_for_obj_ptr_proj=model_config['use_mlp_for_obj_ptr_proj'],
+        proj_tpos_enc_in_obj_ptrs=model_config['proj_tpos_enc_in_obj_ptrs'],
+        no_obj_embed_spatial=model_config['no_obj_embed_spatial'],
+        use_signed_tpos_enc_to_obj_ptrs=model_config['use_signed_tpos_enc_to_obj_ptrs'],
         binarize_mask_from_pts_for_mem_enc=True if segmentor == 'video' else False,
     ).to(dtype).to(device).eval()
 
@@ -202,19 +205,27 @@
         model = SAM2AutomaticMaskGenerator(model)
     else:
         raise ValueError(f"Segmentor {segmentor} not supported")
+
     return model
+
 
 class LS_SAM2_ULTRA:
-    model_path = os.path.join(folder_paths.models_dir, 'sam2')
-    model_dict = get_files(model_path, ['safetensors'])
 
     def __init__(self):
         self.NODE_NAME = 'SAM2 Ultra'
         pass
 
     @classmethod
     def INPUT_TYPES(cls):
-        sam2_model_list = list(cls.model_dict.keys())
+        sam2_model_list = ['sam2_hiera_base_plus.safetensors',
+                           'sam2_hiera_large.safetensors',
+                           'sam2_hiera_small.safetensors',
+                           'sam2_hiera_tiny.safetensors',
+                           'sam2.1_hiera_base_plus.safetensors',
+                           'sam2.1_hiera_large.safetensors',
+                           'sam2.1_hiera_small.safetensors',
+                           'sam2.1_hiera_tiny.safetensors',
+                           ]
         model_precision_list = [ 'fp16','bf16','fp32']
         select_list = ["all", "first", "by_index"]
         method_list = ['VITMatte', 'VITMatte(local)', 'PyMatting', 'GuidedFilter', ]
@@ -257,32 +268,55 @@ def sam2_ultra(self, image, bboxes, sam2_model, precision,
 
         # load model
         sam2_path = os.path.join(folder_paths.models_dir, "sam2")
+        if precision != 'fp32' and "2.1" in sam2_model:
+            base_name, extension = sam2_model.rsplit('.', 1)
+            sam2_model = f"{base_name}-fp16.{extension}"
         model_path = os.path.join(sam2_path, sam2_model)
+
         if device == "cuda":
             if torch.cuda.get_device_properties(0).major >= 8:
                 # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
                 torch.backends.cuda.matmul.allow_tf32 = True
                 torch.backends.cudnn.allow_tf32 = True
 
         dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
-        sam2_device = {"cuda": torch.device("cuda"), "cpu": torch.device("cpu")}[device]
+        # device = {"cuda": torch.device("cuda"), "cpu": torch.device("cpu")}[device]
         segmentor = 'single_image'
 
+        if not os.path.exists(model_path):
+            log(f"{self.NODE_NAME}: Downloading SAM2 model to: {model_path}")
+            from huggingface_hub import snapshot_download
+            snapshot_download(repo_id="Kijai/sam2-safetensors",
+                              allow_patterns=[f"*{sam2_model}*"],
+                              local_dir=sam2_path,
+                              local_dir_use_symlinks=False)
+
         model_mapping = {
-            "base": "sam2_hiera_b+.yaml",
-            "large": "sam2_hiera_l.yaml",
-            "small": "sam2_hiera_s.yaml",
-            "tiny": "sam2_hiera_t.yaml"
+            "2.0": {
+                "base": "sam2_hiera_b+.yaml",
+                "large": "sam2_hiera_l.yaml",
+                "small": "sam2_hiera_s.yaml",
+                "tiny": "sam2_hiera_t.yaml"
+            },
+            "2.1": {
+                "base": "sam2.1_hiera_b+.yaml",
+                "large": "sam2.1_hiera_l.yaml",
+                "small": "sam2.1_hiera_s.yaml",
+                "tiny": "sam2.1_hiera_t.yaml"
+            }
         }
+        version = "2.1" if "2.1" in sam2_model else "2.0"
         model_cfg_path = next(
-            (os.path.join(os.path.dirname(os.path.abspath(__file__)), "sam2", "sam2_configs", cfg) for key, cfg in model_mapping.items() if key in sam2_model),
+            (os.path.join(os.path.dirname(os.path.abspath(__file__)), "sam2", "sam2_configs", cfg)
+             for key, cfg in model_mapping[version].items() if key in sam2_model),
             None
         )
+        log(f"{self.NODE_NAME}: Using model config: {model_cfg_path}")
 
-        model = load_model(model_path, model_cfg_path, segmentor, dtype, sam2_device)
+        model = load_model(model_path, model_cfg_path, segmentor, dtype, device)
         offload_device = mm.unet_offload_device()
 
-        B, H, W, C = image.shape
+        # B, H, W, C = image.shape
         indexs = extract_numbers(select_index)
 
         # Handle possible bboxes
@@ -308,16 +342,16 @@ def sam2_ultra(self, image, bboxes, sam2_model, precision,
                     log(f"{self.NODE_NAME} invalid bbox index {i}", message_type='warning')
         else:
             final_box = np.array(boxes_np_batch[0])
-            final_labels = None
+            # final_labels = None
 
         mask_list = []
         try:
-            model.to(sam2_device)
+            model.to(device)
         except:
-            model.model.to(sam2_device)
+            model.model.to(device)
 
-        autocast_condition = not mm.is_device_mps(sam2_device)
-        with torch.autocast(mm.get_autocast_device(sam2_device), dtype=dtype) if autocast_condition else nullcontext():
+        autocast_condition = not mm.is_device_mps(device)
+        with torch.autocast(mm.get_autocast_device(device), dtype=dtype) if autocast_condition else nullcontext():
             image_np = (image.contiguous() * 255).byte().numpy()
             comfy_pbar = ProgressBar(len(image_np))
@@ -339,8 +373,8 @@ def sam2_ultra(self, image, bboxes, sam2_model, precision,
                 if out_masks.ndim == 3:
                     sorted_ind = np.argsort(scores)[::-1]
                     out_masks = out_masks[sorted_ind][0] # choose only the best result for now
-                    scores = scores[sorted_ind]
-                    logits = logits[sorted_ind]
+                    # scores = scores[sorted_ind]
+                    # logits = logits[sorted_ind]
                     mask_list.append(np.expand_dims(out_masks, axis=0))
                 else:
                     _, _, H, W = out_masks.shape
@@ -434,15 +468,21 @@ def poisson_disk_sampling(mask:Image, radius:float=32, num_points:int=16) -> lis
 class LS_SAM2_VIDEO_ULTRA:
-    model_path = os.path.join(folder_paths.models_dir, 'sam2')
-    model_dict = get_files(model_path, ['safetensors'])
+
     def __init__(self):
         self.NODE_NAME = 'SAM2 Video Ultra'
-        pass
 
     @classmethod
     def INPUT_TYPES(cls):
-        sam2_model_list = list(cls.model_dict.keys())
+        sam2_model_list = ['sam2_hiera_base_plus.safetensors',
+                           'sam2_hiera_large.safetensors',
+                           'sam2_hiera_small.safetensors',
+                           'sam2_hiera_tiny.safetensors',
+                           'sam2.1_hiera_base_plus.safetensors',
+                           'sam2.1_hiera_large.safetensors',
+                           'sam2.1_hiera_small.safetensors',
+                           'sam2.1_hiera_tiny.safetensors',
+                           ]
         model_precision_list = ['fp16','bf16']
         method_list = ['VITMatte']
         device_list = ['cuda']
@@ -492,24 +532,49 @@ def sam2_video_ultra(self, image, sam2_model, precision,
 
         # load model
         sam2_path = os.path.join(folder_paths.models_dir, "sam2")
+        if precision != 'fp32' and "2.1" in sam2_model:
+            base_name, extension = sam2_model.rsplit('.', 1)
+            sam2_model = f"{base_name}-fp16.{extension}"
         model_path = os.path.join(sam2_path, sam2_model)
+
         if device == "cuda":
             if torch.cuda.get_device_properties(0).major >= 8:
                 # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
                 torch.backends.cuda.matmul.allow_tf32 = True
                 torch.backends.cudnn.allow_tf32 = True
 
         dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
-        sam2_device = {"cuda": torch.device("cuda"), "cpu": torch.device("cpu")}[device]
+
+        if not os.path.exists(model_path):
+            log(f"{self.NODE_NAME}: Downloading SAM2 model to: {model_path}")
+            from huggingface_hub import snapshot_download
+            snapshot_download(repo_id="Kijai/sam2-safetensors",
+                              allow_patterns=[f"*{sam2_model}*"],
+                              local_dir=sam2_path,
+                              local_dir_use_symlinks=False)
+
         model_mapping = {
-            "base": "sam2_hiera_b+.yaml",
-            "large": "sam2_hiera_l.yaml",
-            "small": "sam2_hiera_s.yaml",
-            "tiny": "sam2_hiera_t.yaml"
+            "2.0": {
+                "base": "sam2_hiera_b+.yaml",
+                "large": "sam2_hiera_l.yaml",
+                "small": "sam2_hiera_s.yaml",
"sam2_hiera_t.yaml" + }, + "2.1": { + "base": "sam2.1_hiera_b+.yaml", + "large": "sam2.1_hiera_l.yaml", + "small": "sam2.1_hiera_s.yaml", + "tiny": "sam2.1_hiera_t.yaml" + } } + version = "2.1" if "2.1" in sam2_model else "2.0" + model_cfg_path = next( - (os.path.join(os.path.dirname(os.path.abspath(__file__)), "sam2", "sam2_configs", cfg) for key, cfg in model_mapping.items() if key in sam2_model), + (os.path.join(os.path.dirname(os.path.abspath(__file__)), "sam2", "sam2_configs", cfg) + for key, cfg in model_mapping[version].items() if key in sam2_model), None ) + log(f"{self.NODE_NAME}: Using model config: {model_cfg_path}") + offload_device = mm.unet_offload_device() B, H, W, C = image.shape @@ -518,24 +583,24 @@ def sam2_video_ultra(self, image, sam2_model, precision, input_mask = F.interpolate(input_mask, size=(256, 256), mode="bilinear") input_mask = input_mask.squeeze(1) - autocast_condition = not mm.is_device_mps(sam2_device) + autocast_condition = not mm.is_device_mps(device) # init video model - v_model = load_model(model_path, model_cfg_path, 'video', dtype, sam2_device) + v_model = load_model(model_path, model_cfg_path, 'video', dtype, device) model_input_image_size = v_model.image_size from comfy.utils import common_upscale resized_image = common_upscale(image.movedim(-1,1), model_input_image_size, model_input_image_size, "bilinear", "disabled").movedim(1,-1) try: - v_model.to(sam2_device) + v_model.to(device) except: - v_model.model.to(sam2_device) + v_model.model.to(device) s_model = None if first_frame_mask is None: # load single_image_model - s_model = load_model(model_path, model_cfg_path, 'single_image', dtype, sam2_device) + s_model = load_model(model_path, model_cfg_path, 'single_image', dtype, device) # gen first frame mask - with torch.autocast(mm.get_autocast_device(sam2_device), dtype=dtype) if autocast_condition else nullcontext(): + with torch.autocast(mm.get_autocast_device(device), dtype=dtype) if autocast_condition else nullcontext(): f_mask = [] boxes_np_batch = [] for bbox_list in bboxes: @@ -557,14 +622,15 @@ def sam2_video_ultra(self, image, sam2_model, precision, point_labels=None, box=input_box, multimask_output=True, - mask_input=None, + # mask_input=None, + mask_input=input_mask[0].unsqueeze(0) if pre_mask is not None else None, ) if out_masks.ndim == 3: sorted_ind = np.argsort(scores)[::-1] out_masks = out_masks[sorted_ind][0] # choose only the best result for now - scores = scores[sorted_ind] - logits = logits[sorted_ind] + # scores = scores[sorted_ind] + # logits = logits[sorted_ind] f_mask.append(np.expand_dims(out_masks, axis=0)) else: _, _, H, W = out_masks.shape @@ -590,7 +656,7 @@ def sam2_video_ultra(self, image, sam2_model, precision, coords = poisson_disk_sampling(f_mask, radius=32, num_points=16) # gen video mask - with torch.autocast(mm.get_autocast_device(sam2_device), dtype=dtype) if autocast_condition else nullcontext(): + with torch.autocast(mm.get_autocast_device(device), dtype=dtype) if autocast_condition else nullcontext(): if not individual_objects: positive_point_coords = np.atleast_2d(np.array(coords)) else: @@ -610,7 +676,7 @@ def sam2_video_ultra(self, image, sam2_model, precision, mask_list = [] if hasattr(self, 'inference_state'): v_model.reset_state(self.inference_state) - self.inference_state = v_model.init_state(resized_image.permute(0, 3, 1, 2).contiguous(), H, W, device=sam2_device) + self.inference_state = v_model.init_state(resized_image.permute(0, 3, 1, 2).contiguous(), H, W, device=device) if individual_objects: for 
diff --git a/pyproject.toml b/pyproject.toml
index cbf149a..3cf411d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui_layerstyle"
 description = "A set of nodes for ComfyUI it generate image like Adobe Photoshop's Layer Style. the Drop Shadow is first completed node, and follow-up work is in progress."
-version = "1.0.79"
+version = "1.0.80"
 license = "MIT"
 dependencies = ["numpy", "pillow", "torch", "matplotlib", "Scipy", "scikit_image", "scikit_learn", "opencv-contrib-python", "pymatting", "segment_anything", "timm", "addict", "yapf", "colour-science", "wget", "mediapipe", "loguru", "typer_config", "fastapi", "rich", "google-generativeai", "diffusers", "omegaconf", "tqdm", "transformers", "kornia", "image-reward", "ultralytics", "blend_modes", "blind-watermark", "qrcode", "pyzbar", "transparent-background", "huggingface_hub", "accelerate", "bitsandbytes", "torchscale", "wandb", "hydra-core", "psd-tools", "inference-cli[yolo-world]", "inference-gpu[yolo-world]", "onnxruntime", "peft"]