diff --git a/app/app_settings.py b/app/app_settings.py index a545df92..c7ac73bf 100644 --- a/app/app_settings.py +++ b/app/app_settings.py @@ -9,8 +9,14 @@ class AppSettings(): self.user_manager = user_manager def get_settings(self, request): - file = self.user_manager.get_request_user_filepath( - request, "comfy.settings.json") + try: + file = self.user_manager.get_request_user_filepath( + request, + "comfy.settings.json" + ) + except KeyError as e: + logging.error("User settings not found.") + raise web.HTTPUnauthorized() from e if os.path.isfile(file): try: with open(file) as f: diff --git a/comfy/cli_args.py b/comfy/cli_args.py index f4c99623..af1fca3b 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -79,6 +79,7 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).") fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.") fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.") +fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.") parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.") diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 87d32a66..11bc5778 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -110,9 +110,13 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False): elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json") elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd: + embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0] if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152: - json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json") - elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577: + if embed_shape == 729: + json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json") + elif embed_shape == 1024: + json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json") + elif embed_shape == 577: if "multi_modal_projector.linear_1.bias" in sd: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json") else: diff --git a/comfy/clip_vision_siglip_512.json b/comfy/clip_vision_siglip_512.json new file mode 100644 index 00000000..7fb93ce1 --- /dev/null +++ b/comfy/clip_vision_siglip_512.json @@ -0,0 +1,13 @@ +{ + "num_channels": 3, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 512, + "intermediate_size": 4304, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 16, + "image_mean": [0.5, 0.5, 0.5], + "image_std": [0.5, 0.5, 0.5] +} diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index ede50646..45f9e311 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -847,6 +847,7 @@ class SpatialTransformer(nn.Module): if not isinstance(context, list): context = [context] * len(self.transformer_blocks) b, c, h, w = x.shape + transformer_options["activations_shape"] = list(x.shape) x_in = x x = self.norm(x) if not self.use_linear: @@ -962,6 +963,7 @@ class SpatialVideoTransformer(SpatialTransformer): transformer_options={} ) -> torch.Tensor: _, _, h, w = x.shape + transformer_options["activations_shape"] = list(x.shape) x_in = x spatial_context = None if exists(context): diff --git a/comfy/lora_convert.py b/comfy/lora_convert.py index 05032c69..3e00b63d 100644 --- a/comfy/lora_convert.py +++ b/comfy/lora_convert.py @@ -1,4 +1,5 @@ import torch +import comfy.utils def convert_lora_bfl_control(sd): #BFL loras for Flux @@ -11,7 +12,13 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux return sd_out +def convert_lora_wan_fun(sd): #Wan Fun loras + return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"}) + + def convert_lora(sd): if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd: return convert_lora_bfl_control(sd) + if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd: + return convert_lora_wan_fun(sd) return sd diff --git a/comfy/model_base.py b/comfy/model_base.py index 8f588e2b..6bc627ae 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1000,7 +1000,6 @@ class WAN21(BaseModel): device = kwargs["device"] if image is None: - image = torch.zeros_like(noise) shape_image = list(noise.shape) shape_image[1] = extra_channels image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device) @@ -1013,6 +1012,9 @@ class WAN21(BaseModel): if not self.image_to_video or extra_channels == image.shape[1]: return image + if image.shape[1] > (extra_channels - 4): + image = image[:, :(extra_channels - 4)] + mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) if mask is None: mask = torch.zeros_like(noise)[:, :4] diff --git a/comfy/model_management.py b/comfy/model_management.py index fa9515c4..90785c2c 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -851,6 +851,8 @@ def text_encoder_dtype(device=None): return torch.float8_e5m2 elif args.fp16_text_enc: return torch.float16 + elif args.bf16_text_enc: + return torch.bfloat16 elif args.fp32_text_enc: return torch.float32 @@ -1263,6 +1265,8 @@ def soft_empty_cache(force=False): torch.xpu.empty_cache() elif is_ascend_npu(): torch.npu.empty_cache() + elif is_mlu(): + torch.mlu.empty_cache() elif torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.ipc_collect() diff --git a/comfy/sd.py b/comfy/sd.py index d096f496..4d3aef3e 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -265,6 +265,7 @@ class VAE: self.process_input = lambda image: image * 2.0 - 1.0 self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0) self.working_dtypes = [torch.bfloat16, torch.float32] + self.disable_offload = False self.downscale_index_formula = None self.upscale_index_formula = None @@ -337,6 +338,7 @@ class VAE: self.process_output = lambda audio: audio self.process_input = lambda audio: audio self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] + self.disable_offload = True elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: #genmo mochi vae if "blocks.2.blocks.3.stack.5.weight" in sd: sd = comfy.utils.state_dict_prefix_replace(sd, {"": "decoder."}) @@ -515,7 +517,7 @@ class VAE: pixel_samples = None try: memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) - model_management.load_models_gpu([self.patcher], memory_required=memory_used) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) free_memory = model_management.get_free_memory(self.device) batch_number = int(free_memory / memory_used) batch_number = max(1, batch_number) @@ -544,7 +546,7 @@ class VAE: def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None): self.throw_exception_if_invalid() memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile - model_management.load_models_gpu([self.patcher], memory_required=memory_used) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) dims = samples.ndim - 2 args = {} if tile_x is not None: @@ -578,7 +580,7 @@ class VAE: pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) try: memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) - model_management.load_models_gpu([self.patcher], memory_required=memory_used) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) free_memory = model_management.get_free_memory(self.device) batch_number = int(free_memory / max(1, memory_used)) batch_number = max(1, batch_number) @@ -612,7 +614,7 @@ class VAE: pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile - model_management.load_models_gpu([self.patcher], memory_required=memory_used) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) args = {} if tile_x is not None: diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py index 1ca7c2fe..5adc6b65 100644 --- a/comfy_extras/nodes_hunyuan3d.py +++ b/comfy_extras/nodes_hunyuan3d.py @@ -244,7 +244,7 @@ def save_glb(vertices, faces, filepath, metadata=None): Parameters: vertices: torch.Tensor of shape (N, 3) - The vertex coordinates - faces: torch.Tensor of shape (M, 4) or (M, 3) - The face indices (quad or triangle faces) + faces: torch.Tensor of shape (M, 3) - The face indices (triangle faces) filepath: str - Output filepath (should end with .glb) """ diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index fdc6c7c1..52588920 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -446,10 +446,9 @@ class LTXVPreprocess: CATEGORY = "image" def preprocess(self, image, img_compression): - if img_compression > 0: - output_images = [] - for i in range(image.shape[0]): - output_images.append(preprocess(image[i], img_compression)) + output_images = [] + for i in range(image.shape[0]): + output_images.append(preprocess(image[i], img_compression)) return (torch.stack(output_images),) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 63fd13b9..13d2b4ba 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -2,6 +2,7 @@ import numpy as np import scipy.ndimage import torch import comfy.utils +import node_helpers from nodes import MAX_RESOLUTION @@ -87,6 +88,7 @@ class ImageCompositeMasked: CATEGORY = "image" def composite(self, destination, source, x, y, resize_source, mask = None): + destination, source = node_helpers.image_alpha_fix(destination, source) destination = destination.clone().movedim(-1, 1) output = composite(destination, source.movedim(-1, 1), x, y, mask, 1, resize_source).movedim(1, -1) return (output,) diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index 68f6ef51..5b954201 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -6,7 +6,7 @@ import math import comfy.utils import comfy.model_management - +import node_helpers class Blend: def __init__(self): @@ -34,6 +34,7 @@ class Blend: CATEGORY = "image/postprocessing" def blend_images(self, image1: torch.Tensor, image2: torch.Tensor, blend_factor: float, blend_mode: str): + image1, image2 = node_helpers.image_alpha_fix(image1, image2) image2 = image2.to(image1.device) if image1.shape != image2.shape: image2 = image2.permute(0, 3, 1, 2) diff --git a/node_helpers.py b/node_helpers.py index 48da3b09..c3e1a14c 100644 --- a/node_helpers.py +++ b/node_helpers.py @@ -44,3 +44,11 @@ def string_to_torch_dtype(string): return torch.float16 if string == "bf16": return torch.bfloat16 + +def image_alpha_fix(destination, source): + if destination.shape[-1] < source.shape[-1]: + source = source[...,:destination.shape[-1]] + elif destination.shape[-1] > source.shape[-1]: + destination = torch.nn.functional.pad(destination, (0, 1)) + destination[..., -1] = 1.0 + return destination, source diff --git a/nodes.py b/nodes.py index 30f6a30f..83fa97e4 100644 --- a/nodes.py +++ b/nodes.py @@ -786,6 +786,8 @@ class ControlNetLoader: def load_controlnet(self, control_net_name): controlnet_path = folder_paths.get_full_path_or_raise("controlnet", control_net_name) controlnet = comfy.controlnet.load_controlnet(controlnet_path) + if controlnet is None: + raise RuntimeError("ERROR: controlnet file is invalid and does not contain a valid controlnet model.") return (controlnet,) class DiffControlNetLoader: @@ -1006,6 +1008,8 @@ class CLIPVisionLoader: def load_clip(self, clip_name): clip_path = folder_paths.get_full_path_or_raise("clip_vision", clip_name) clip_vision = comfy.clip_vision.load(clip_path) + if clip_vision is None: + raise RuntimeError("ERROR: clip vision file is invalid and does not contain a valid vision model.") return (clip_vision,) class CLIPVisionEncode: diff --git a/requirements.txt b/requirements.txt index c78d3c22..806fbc75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.14.5 +comfyui-frontend-package==1.14.6 torch torchsde torchvision