Mirror of https://github.com/comfyanonymous/ComfyUI.git, synced 2025-06-09 15:47:14 +00:00

Commit 2fa9affcc1: Merge branch 'master' into worksplit-multigpu
@@ -9,8 +9,14 @@ class AppSettings():
         self.user_manager = user_manager
 
     def get_settings(self, request):
-        file = self.user_manager.get_request_user_filepath(
-            request, "comfy.settings.json")
+        try:
+            file = self.user_manager.get_request_user_filepath(
+                request,
+                "comfy.settings.json"
+            )
+        except KeyError as e:
+            logging.error("User settings not found.")
+            raise web.HTTPUnauthorized() from e
         if os.path.isfile(file):
             try:
                 with open(file) as f:
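Note: wrapping the filepath lookup in try/except turns a request without a known user into an HTTP 401 response instead of an unhandled KeyError (a 500). A minimal sketch of that failure path; the user store and header below are hypothetical stand-ins, only the except/401 pattern comes from the hunk above:

import logging
from aiohttp import web

USERS = {"default": "/tmp/comfy-users/default"}   # hypothetical user -> storage root mapping

def get_request_user_filepath(request, filename):
    user_id = request.headers.get("comfy-user", "default")
    return USERS[user_id] + "/" + filename        # unknown users raise KeyError

def get_settings_file(request):
    try:
        return get_request_user_filepath(request, "comfy.settings.json")
    except KeyError as e:
        logging.error("User settings not found.")
        raise web.HTTPUnauthorized() from e       # client gets a 401 instead of a server error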
@@ -79,6 +79,7 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor
 fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
 fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
+fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")
 
 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
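Note: the new --bf16-text-enc flag follows the same pattern as the existing text-encoder dtype flags; argparse converts the dashes to underscores, so the model-management hunk further down reads it as args.bf16_text_enc. A standalone sketch of the flag-to-dtype mapping; the parser and group here are local stand-ins, not ComfyUI's actual objects:

import argparse
import torch

parser = argparse.ArgumentParser()
fpte_group = parser.add_mutually_exclusive_group()  # stand-in for the fpte_group in cli_args
fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")
fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")

args = parser.parse_args(["--bf16-text-enc"])
# argparse exposes "--bf16-text-enc" as the attribute "bf16_text_enc"
dtype = torch.bfloat16 if args.bf16_text_enc else (torch.float16 if args.fp16_text_enc else torch.float32)
print(dtype)  # torch.bfloat16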
@@ -110,9 +110,13 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
     elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+        embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
         if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
-        elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
+            if embed_shape == 729:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+            elif embed_shape == 1024:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
+        elif embed_shape == 577:
             if "multi_modal_projector.linear_1.bias" in sd:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
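Note: the new embed_shape variable reads the number of rows in the position-embedding table and uses it to tell the SigLIP variants apart (729 vs 1024 positions) and to detect the 577-position checkpoints. A small illustrative sketch of the same dispatch using a dict instead of the elif chain; the config filenames come from the hunk above, everything else is a stand-in:

import torch

# Hypothetical mini state dict containing only the key the loader inspects.
sd = {"vision_model.embeddings.position_embedding.weight": torch.zeros(1024, 1152)}

CONFIG_BY_EMBED_SHAPE = {
    729: "clip_vision_siglip_384.json",   # SigLIP at 384px
    1024: "clip_vision_siglip_512.json",  # SigLIP at 512px (new in this change)
}

embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
json_config = CONFIG_BY_EMBED_SHAPE.get(embed_shape)
print(embed_shape, json_config)  # 1024 clip_vision_siglip_512.json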
comfy/clip_vision_siglip_512.json (new file, 13 lines)

@@ -0,0 +1,13 @@
+{
+    "num_channels": 3,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 512,
+    "intermediate_size": 4304,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5]
+}
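Note: the numbers in this config line up with the embed_shape == 1024 branch above. A SigLIP-style vision tower has one position embedding per image patch and no class token, so the position count is (image_size // patch_size) squared, i.e. (512 // 16)^2 = 32^2 = 1024. A quick check of that arithmetic; the helper below is purely illustrative:

def num_positions(image_size: int, patch_size: int) -> int:
    # SigLIP-style towers: one position embedding per patch, no class token.
    return (image_size // patch_size) ** 2

print(num_positions(512, 16))  # 1024 -> matches the "embed_shape == 1024" branch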
@@ -847,6 +847,7 @@ class SpatialTransformer(nn.Module):
         if not isinstance(context, list):
             context = [context] * len(self.transformer_blocks)
         b, c, h, w = x.shape
+        transformer_options["activations_shape"] = list(x.shape)
         x_in = x
         x = self.norm(x)
         if not self.use_linear:
@@ -962,6 +963,7 @@ class SpatialVideoTransformer(SpatialTransformer):
         transformer_options={}
     ) -> torch.Tensor:
         _, _, h, w = x.shape
+        transformer_options["activations_shape"] = list(x.shape)
         x_in = x
         spatial_context = None
         if exists(context):
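Note: both transformer forward passes now record the incoming activation shape in transformer_options before the tensor is flattened for attention. A hedged sketch of how a downstream consumer might use that key; the helper and wiring here are hypothetical, only the "activations_shape" entry comes from the hunks above:

import torch

def unflatten_tokens(tokens: torch.Tensor, transformer_options: dict) -> torch.Tensor:
    # tokens: (b, h*w, c) as seen inside attention; restore the spatial layout (b, c, h, w).
    b, c, h, w = transformer_options["activations_shape"]
    return tokens.movedim(-1, 1).reshape(b, c, h, w)

opts = {"activations_shape": [1, 320, 64, 64]}   # what the forward pass would store
tokens = torch.randn(1, 64 * 64, 320)            # flattened activations
print(unflatten_tokens(tokens, opts).shape)      # torch.Size([1, 320, 64, 64])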
@@ -1,4 +1,5 @@
 import torch
+import comfy.utils
 
 
 def convert_lora_bfl_control(sd): #BFL loras for Flux
@@ -11,7 +12,13 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux
     return sd_out
 
 
+def convert_lora_wan_fun(sd): #Wan Fun loras
+    return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"})
+
+
 def convert_lora(sd):
     if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd:
         return convert_lora_bfl_control(sd)
+    if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd:
+        return convert_lora_wan_fun(sd)
     return sd
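Note: convert_lora_wan_fun only renames keys. Wan Fun LoRAs ship with a double underscore after "lora_unet", and comfy.utils.state_dict_prefix_replace rewrites that prefix so the keys match what the loader expects. A standalone sketch of the same rename; this is a simplified reimplementation for illustration, not the comfy.utils function:

def prefix_replace(sd: dict, replace: dict) -> dict:
    # Rewrite any key that starts with one of the given prefixes.
    out = {}
    for k, v in sd.items():
        for old, new in replace.items():
            if k.startswith(old):
                k = new + k[len(old):]
                break
        out[k] = v
    return out

sd = {"lora_unet__blocks_0_cross_attn_k.lora_down.weight": "tensor..."}
print(prefix_replace(sd, {"lora_unet__": "lora_unet_"}))
# {'lora_unet_blocks_0_cross_attn_k.lora_down.weight': 'tensor...'}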
@@ -1000,7 +1000,6 @@ class WAN21(BaseModel):
         device = kwargs["device"]
 
         if image is None:
-            image = torch.zeros_like(noise)
             shape_image = list(noise.shape)
             shape_image[1] = extra_channels
             image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
@@ -1013,6 +1012,9 @@ class WAN21(BaseModel):
         if not self.image_to_video or extra_channels == image.shape[1]:
             return image
 
+        if image.shape[1] > (extra_channels - 4):
+            image = image[:, :(extra_channels - 4)]
+
         mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
         if mask is None:
             mask = torch.zeros_like(noise)[:, :4]
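Note: the second WAN21 hunk trims the concatenated image latent so that, together with the 4 mask channels appended afterwards, it does not exceed extra_channels. For example, with a budget of 20 extra channels the image is clipped to 20 - 4 = 16 channels. A small sketch of that bookkeeping; the shapes and budget are made up for illustration:

import torch

extra_channels = 20                       # hypothetical concat budget for the model
image = torch.zeros(1, 36, 4, 8, 8)       # hypothetical latent with too many channels

if image.shape[1] > (extra_channels - 4):
    image = image[:, :(extra_channels - 4)]   # keep room for the 4 mask channels

mask = torch.zeros(1, 4, 4, 8, 8)
print(image.shape[1] + mask.shape[1], "==", extra_channels)  # 20 == 20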
@@ -851,6 +851,8 @@ def text_encoder_dtype(device=None):
         return torch.float8_e5m2
     elif args.fp16_text_enc:
         return torch.float16
+    elif args.bf16_text_enc:
+        return torch.bfloat16
     elif args.fp32_text_enc:
         return torch.float32
 
@@ -1263,6 +1265,8 @@ def soft_empty_cache(force=False):
         torch.xpu.empty_cache()
     elif is_ascend_npu():
         torch.npu.empty_cache()
+    elif is_mlu():
+        torch.mlu.empty_cache()
     elif torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
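Note: soft_empty_cache gains a Cambricon MLU branch alongside the existing XPU, Ascend NPU, and CUDA ones; each backend exposes its own empty_cache(). A simplified, hedged sketch of that dispatch pattern, assuming vendor builds of PyTorch attach their backend modules to torch (this is not ComfyUI's function, which uses its own is_ascend_npu/is_mlu helpers):

import torch

def _has(backend: str) -> bool:
    # True only if the backend module exists in this torch build and reports a device.
    mod = getattr(torch, backend, None)
    return mod is not None and hasattr(mod, "is_available") and mod.is_available()

def soft_empty_cache_sketch():
    if _has("xpu"):
        torch.xpu.empty_cache()
    elif _has("npu"):            # Ascend NPU builds
        torch.npu.empty_cache()
    elif _has("mlu"):            # Cambricon MLU builds (the new branch)
        torch.mlu.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

soft_empty_cache_sketch()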
comfy/sd.py (10 changed lines)

@@ -265,6 +265,7 @@ class VAE:
         self.process_input = lambda image: image * 2.0 - 1.0
         self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
         self.working_dtypes = [torch.bfloat16, torch.float32]
+        self.disable_offload = False
 
         self.downscale_index_formula = None
         self.upscale_index_formula = None
@@ -337,6 +338,7 @@ class VAE:
             self.process_output = lambda audio: audio
             self.process_input = lambda audio: audio
             self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+            self.disable_offload = True
         elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: #genmo mochi vae
             if "blocks.2.blocks.3.stack.5.weight" in sd:
                 sd = comfy.utils.state_dict_prefix_replace(sd, {"": "decoder."})
@@ -515,7 +517,7 @@ class VAE:
         pixel_samples = None
         try:
             memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
-            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
             free_memory = model_management.get_free_memory(self.device)
             batch_number = int(free_memory / memory_used)
             batch_number = max(1, batch_number)
@@ -544,7 +546,7 @@ class VAE:
     def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
         self.throw_exception_if_invalid()
         memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile
-        model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+        model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
         dims = samples.ndim - 2
         args = {}
         if tile_x is not None:
@@ -578,7 +580,7 @@ class VAE:
             pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
         try:
             memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
-            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
             free_memory = model_management.get_free_memory(self.device)
             batch_number = int(free_memory / max(1, memory_used))
             batch_number = max(1, batch_number)
@@ -612,7 +614,7 @@ class VAE:
             pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
 
         memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile
-        model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+        model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
 
         args = {}
         if tile_x is not None:
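Note: every load_models_gpu call in the VAE now forwards force_full_load=self.disable_offload, so VAEs that set disable_offload = True (here, the audio VAE) are loaded fully onto the device instead of being partially offloaded. A minimal sketch of that flag plumbing; the loader and classes below are stand-ins, not ComfyUI's model_management or VAE:

class FakeLoader:
    def load_models_gpu(self, models, memory_required=0, force_full_load=False):
        # A real loader would offload layers when memory is tight unless force_full_load is set.
        mode = "full load" if force_full_load else "allow partial offload"
        print(f"loading {len(models)} model(s), ~{memory_required} bytes, {mode}")

class TinyVAE:
    def __init__(self, audio: bool, loader: FakeLoader):
        self.disable_offload = audio            # mirrors the True/False defaults added above
        self.loader = loader

    def decode(self):
        self.loader.load_models_gpu([self], memory_required=1 << 30,
                                    force_full_load=self.disable_offload)

TinyVAE(audio=True, loader=FakeLoader()).decode()   # -> full load
TinyVAE(audio=False, loader=FakeLoader()).decode()  # -> allow partial offload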
@@ -244,7 +244,7 @@ def save_glb(vertices, faces, filepath, metadata=None):
 
     Parameters:
     vertices: torch.Tensor of shape (N, 3) - The vertex coordinates
-    faces: torch.Tensor of shape (M, 4) or (M, 3) - The face indices (quad or triangle faces)
+    faces: torch.Tensor of shape (M, 3) - The face indices (triangle faces)
     filepath: str - Output filepath (should end with .glb)
     """
 
@@ -446,7 +446,6 @@ class LTXVPreprocess:
     CATEGORY = "image"
 
     def preprocess(self, image, img_compression):
-        if img_compression > 0:
         output_images = []
         for i in range(image.shape[0]):
             output_images.append(preprocess(image[i], img_compression))
@@ -2,6 +2,7 @@ import numpy as np
 import scipy.ndimage
 import torch
 import comfy.utils
+import node_helpers
 
 from nodes import MAX_RESOLUTION
 
@@ -87,6 +88,7 @@ class ImageCompositeMasked:
     CATEGORY = "image"
 
     def composite(self, destination, source, x, y, resize_source, mask = None):
+        destination, source = node_helpers.image_alpha_fix(destination, source)
         destination = destination.clone().movedim(-1, 1)
         output = composite(destination, source.movedim(-1, 1), x, y, mask, 1, resize_source).movedim(1, -1)
         return (output,)
@@ -6,7 +6,7 @@ import math
 
 import comfy.utils
 import comfy.model_management
+import node_helpers
 
 class Blend:
     def __init__(self):
@@ -34,6 +34,7 @@ class Blend:
     CATEGORY = "image/postprocessing"
 
     def blend_images(self, image1: torch.Tensor, image2: torch.Tensor, blend_factor: float, blend_mode: str):
+        image1, image2 = node_helpers.image_alpha_fix(image1, image2)
         image2 = image2.to(image1.device)
         if image1.shape != image2.shape:
             image2 = image2.permute(0, 3, 1, 2)
@@ -44,3 +44,11 @@ def string_to_torch_dtype(string):
         return torch.float16
     if string == "bf16":
         return torch.bfloat16
+
+def image_alpha_fix(destination, source):
+    if destination.shape[-1] < source.shape[-1]:
+        source = source[...,:destination.shape[-1]]
+    elif destination.shape[-1] > source.shape[-1]:
+        destination = torch.nn.functional.pad(destination, (0, 1))
+        destination[..., -1] = 1.0
+    return destination, source
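Note: image_alpha_fix reconciles mismatched channel counts before two image batches are combined in the node changes above: when the source has more channels than the destination (e.g. RGBA over RGB) the extra source channels are dropped; the other branch pads the destination with a constant 1.0 channel. A quick usage example with ComfyUI-style (B, H, W, C) tensors, copying the function exactly as added above:

import torch

def image_alpha_fix(destination, source):
    if destination.shape[-1] < source.shape[-1]:
        source = source[..., :destination.shape[-1]]
    elif destination.shape[-1] > source.shape[-1]:
        destination = torch.nn.functional.pad(destination, (0, 1))
        destination[..., -1] = 1.0
    return destination, source

rgb_destination = torch.rand(1, 4, 4, 3)   # RGB batch, (B, H, W, C)
rgba_source = torch.rand(1, 4, 4, 4)       # RGBA batch with an alpha channel

destination, source = image_alpha_fix(rgb_destination, rgba_source)
print(destination.shape[-1], source.shape[-1])  # 3 3 -> the source alpha channel was dropped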
nodes.py (4 changed lines)

@@ -786,6 +786,8 @@ class ControlNetLoader:
     def load_controlnet(self, control_net_name):
         controlnet_path = folder_paths.get_full_path_or_raise("controlnet", control_net_name)
         controlnet = comfy.controlnet.load_controlnet(controlnet_path)
+        if controlnet is None:
+            raise RuntimeError("ERROR: controlnet file is invalid and does not contain a valid controlnet model.")
         return (controlnet,)
 
 class DiffControlNetLoader:
@@ -1006,6 +1008,8 @@ class CLIPVisionLoader:
     def load_clip(self, clip_name):
         clip_path = folder_paths.get_full_path_or_raise("clip_vision", clip_name)
         clip_vision = comfy.clip_vision.load(clip_path)
+        if clip_vision is None:
+            raise RuntimeError("ERROR: clip vision file is invalid and does not contain a valid vision model.")
         return (clip_vision,)
 
 class CLIPVisionEncode:
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.14.5
+comfyui-frontend-package==1.14.6
 torch
 torchsde
 torchvision