Merge branch 'master' into worksplit-multigpu

Jedrzej Kosinski 2025-04-08 22:52:17 -05:00
commit 2fa9affcc1
16 changed files with 71 additions and 16 deletions

View File

@@ -9,8 +9,14 @@ class AppSettings():
         self.user_manager = user_manager

     def get_settings(self, request):
-        file = self.user_manager.get_request_user_filepath(
-            request, "comfy.settings.json")
+        try:
+            file = self.user_manager.get_request_user_filepath(
+                request,
+                "comfy.settings.json"
+            )
+        except KeyError as e:
+            logging.error("User settings not found.")
+            raise web.HTTPUnauthorized() from e
         if os.path.isfile(file):
             try:
                 with open(file) as f:

View File

@@ -79,6 +79,7 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor
 fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
 fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
+fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")

 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")

View File

@@ -110,9 +110,13 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
     elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+        embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
         if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
-        elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
+            if embed_shape == 729:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+            elif embed_shape == 1024:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
+        elif embed_shape == 577:
             if "multi_modal_projector.linear_1.bias" in sd:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
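Note: the new 1024-position branch above tells the 512px SigLIP checkpoint apart from the existing 384px one purely by the position-embedding count. A rough standalone sketch of that routing (illustrative helper, not the ComfyUI loader; the function name and return values are made up):

def pick_siglip_config(hidden_size, num_positions):
    # Mirrors the shape checks in the hunk above for 1152-wide SigLIP models.
    if hidden_size != 1152:
        return None
    if num_positions == 729:
        return "clip_vision_siglip_384.json"
    if num_positions == 1024:   # (512 // 16) ** 2 patches for the new 512px config
        return "clip_vision_siglip_512.json"
    return None

assert pick_siglip_config(1152, 1024) == "clip_vision_siglip_512.json"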

View File

@@ -0,0 +1,13 @@
+{
+    "num_channels": 3,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 512,
+    "intermediate_size": 4304,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5]
+}

View File

@@ -847,6 +847,7 @@ class SpatialTransformer(nn.Module):
         if not isinstance(context, list):
             context = [context] * len(self.transformer_blocks)
         b, c, h, w = x.shape
+        transformer_options["activations_shape"] = list(x.shape)
         x_in = x
         x = self.norm(x)
         if not self.use_linear:
@@ -962,6 +963,7 @@
         transformer_options={}
     ) -> torch.Tensor:
         _, _, h, w = x.shape
+        transformer_options["activations_shape"] = list(x.shape)
         x_in = x
         spatial_context = None
         if exists(context):

View File

@@ -1,4 +1,5 @@
 import torch
+import comfy.utils

 def convert_lora_bfl_control(sd): #BFL loras for Flux
@@ -11,7 +12,13 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux
     return sd_out

+def convert_lora_wan_fun(sd): #Wan Fun loras
+    return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"})
+
 def convert_lora(sd):
     if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd:
         return convert_lora_bfl_control(sd)
+    if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd:
+        return convert_lora_wan_fun(sd)
     return sd
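Note: convert_lora_wan_fun above delegates to comfy.utils.state_dict_prefix_replace. A minimal standalone approximation of the rename it performs (the sample key is the same one used by the detection check; everything else here is illustrative):

def rename_wan_fun_keys(sd):
    # Rewrite the doubled-underscore prefix, leave all other keys untouched.
    return {("lora_unet_" + k[len("lora_unet__"):]) if k.startswith("lora_unet__") else k: v
            for k, v in sd.items()}

print(rename_wan_fun_keys({"lora_unet__blocks_0_cross_attn_k.lora_down.weight": 0}))
# {'lora_unet_blocks_0_cross_attn_k.lora_down.weight': 0}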

View File

@@ -1000,7 +1000,6 @@ class WAN21(BaseModel):
         device = kwargs["device"]
         if image is None:
-            image = torch.zeros_like(noise)
             shape_image = list(noise.shape)
             shape_image[1] = extra_channels
             image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
@@ -1013,6 +1012,9 @@ class WAN21(BaseModel):
         if not self.image_to_video or extra_channels == image.shape[1]:
             return image
+        if image.shape[1] > (extra_channels - 4):
+            image = image[:, :(extra_channels - 4)]
         mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
         if mask is None:
             mask = torch.zeros_like(noise)[:, :4]
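Note: a small worked example of the channel trim added above, using plain torch and assumed shapes (the 36/32/4 split is hypothetical): when the concat latent image already fills every channel the model expects, the surplus is sliced off so the 4 mask channels can still be concatenated.

import torch

extra_channels = 36                          # assumed: 32 image channels + 4 mask channels
image = torch.zeros(1, 36, 16, 64, 64)       # latent image that already uses all 36

if image.shape[1] > (extra_channels - 4):
    image = image[:, :(extra_channels - 4)]  # keep 32, leaving room for the mask

print(image.shape)                           # torch.Size([1, 32, 16, 64, 64])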

View File

@@ -851,6 +851,8 @@ def text_encoder_dtype(device=None):
         return torch.float8_e5m2
     elif args.fp16_text_enc:
         return torch.float16
+    elif args.bf16_text_enc:
+        return torch.bfloat16
     elif args.fp32_text_enc:
         return torch.float32
@@ -1263,6 +1265,8 @@ def soft_empty_cache(force=False):
         torch.xpu.empty_cache()
     elif is_ascend_npu():
         torch.npu.empty_cache()
+    elif is_mlu():
+        torch.mlu.empty_cache()
     elif torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()

View File

@@ -265,6 +265,7 @@ class VAE:
         self.process_input = lambda image: image * 2.0 - 1.0
         self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
         self.working_dtypes = [torch.bfloat16, torch.float32]
+        self.disable_offload = False
         self.downscale_index_formula = None
         self.upscale_index_formula = None
@@ -337,6 +338,7 @@ class VAE:
             self.process_output = lambda audio: audio
             self.process_input = lambda audio: audio
             self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+            self.disable_offload = True
         elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: #genmo mochi vae
             if "blocks.2.blocks.3.stack.5.weight" in sd:
                 sd = comfy.utils.state_dict_prefix_replace(sd, {"": "decoder."})
@@ -515,7 +517,7 @@ class VAE:
         pixel_samples = None
         try:
             memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
-            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
             free_memory = model_management.get_free_memory(self.device)
             batch_number = int(free_memory / memory_used)
             batch_number = max(1, batch_number)
@@ -544,7 +546,7 @@ class VAE:
     def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
         self.throw_exception_if_invalid()
         memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile
-        model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+        model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
         dims = samples.ndim - 2
         args = {}
         if tile_x is not None:
@@ -578,7 +580,7 @@ class VAE:
             pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
         try:
             memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
-            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
             free_memory = model_management.get_free_memory(self.device)
             batch_number = int(free_memory / max(1, memory_used))
             batch_number = max(1, batch_number)
@@ -612,7 +614,7 @@ class VAE:
             pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
         memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile
-        model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+        model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
         args = {}
         if tile_x is not None:

View File

@@ -244,7 +244,7 @@ def save_glb(vertices, faces, filepath, metadata=None):
     Parameters:
     vertices: torch.Tensor of shape (N, 3) - The vertex coordinates
-    faces: torch.Tensor of shape (M, 4) or (M, 3) - The face indices (quad or triangle faces)
+    faces: torch.Tensor of shape (M, 3) - The face indices (triangle faces)
     filepath: str - Output filepath (should end with .glb)
     """

View File

@@ -446,7 +446,6 @@ class LTXVPreprocess:
     CATEGORY = "image"

     def preprocess(self, image, img_compression):
-        if img_compression > 0:
         output_images = []
         for i in range(image.shape[0]):
             output_images.append(preprocess(image[i], img_compression))

View File

@@ -2,6 +2,7 @@ import numpy as np
 import scipy.ndimage
 import torch
 import comfy.utils
+import node_helpers

 from nodes import MAX_RESOLUTION
@@ -87,6 +88,7 @@ class ImageCompositeMasked:
     CATEGORY = "image"

     def composite(self, destination, source, x, y, resize_source, mask = None):
+        destination, source = node_helpers.image_alpha_fix(destination, source)
         destination = destination.clone().movedim(-1, 1)
         output = composite(destination, source.movedim(-1, 1), x, y, mask, 1, resize_source).movedim(1, -1)
         return (output,)

View File

@@ -6,7 +6,7 @@ import math
 import comfy.utils
 import comfy.model_management
+import node_helpers

 class Blend:
     def __init__(self):
@@ -34,6 +34,7 @@ class Blend:
     CATEGORY = "image/postprocessing"

     def blend_images(self, image1: torch.Tensor, image2: torch.Tensor, blend_factor: float, blend_mode: str):
+        image1, image2 = node_helpers.image_alpha_fix(image1, image2)
         image2 = image2.to(image1.device)
         if image1.shape != image2.shape:
             image2 = image2.permute(0, 3, 1, 2)

View File

@@ -44,3 +44,11 @@ def string_to_torch_dtype(string):
         return torch.float16
     if string == "bf16":
         return torch.bfloat16
+
+def image_alpha_fix(destination, source):
+    if destination.shape[-1] < source.shape[-1]:
+        source = source[...,:destination.shape[-1]]
+    elif destination.shape[-1] > source.shape[-1]:
+        destination = torch.nn.functional.pad(destination, (0, 1))
+        destination[..., -1] = 1.0
+    return destination, source
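Note: a quick usage sketch of the helper above (assumes it runs inside the ComfyUI repo so node_helpers imports; the shapes are arbitrary). With an RGB destination and an RGBA source, the source's alpha channel is stripped so the composite/blend math downstream sees matching channel counts.

import torch
import node_helpers  # ComfyUI module extended above

destination = torch.rand(1, 64, 64, 3)   # RGB batch
source = torch.rand(1, 64, 64, 4)        # RGBA batch
destination, source = node_helpers.image_alpha_fix(destination, source)
print(destination.shape[-1], source.shape[-1])   # 3 3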

View File

@@ -786,6 +786,8 @@ class ControlNetLoader:
     def load_controlnet(self, control_net_name):
         controlnet_path = folder_paths.get_full_path_or_raise("controlnet", control_net_name)
         controlnet = comfy.controlnet.load_controlnet(controlnet_path)
+        if controlnet is None:
+            raise RuntimeError("ERROR: controlnet file is invalid and does not contain a valid controlnet model.")
         return (controlnet,)

 class DiffControlNetLoader:
@@ -1006,6 +1008,8 @@ class CLIPVisionLoader:
     def load_clip(self, clip_name):
         clip_path = folder_paths.get_full_path_or_raise("clip_vision", clip_name)
         clip_vision = comfy.clip_vision.load(clip_path)
+        if clip_vision is None:
+            raise RuntimeError("ERROR: clip vision file is invalid and does not contain a valid vision model.")
         return (clip_vision,)

 class CLIPVisionEncode:

View File

@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.14.5
+comfyui-frontend-package==1.14.6
 torch
 torchsde
 torchvision