diff --git a/comfy_api/v3/io.py b/comfy_api/v3/io.py
index c15d77542..2307f9b65 100644
--- a/comfy_api/v3/io.py
+++ b/comfy_api/v3/io.py
@@ -439,6 +439,12 @@ class MultiCombo(ComfyTypeI):
 class Image(ComfyTypeIO):
     Type = torch.Tensor
 
+
+@comfytype(io_type="WAN_CAMERA_EMBEDDING")
+class WanCameraEmbedding(ComfyTypeIO):
+    Type = torch.Tensor
+
+
 @comfytype(io_type="WEBCAM")
 class Webcam(ComfyTypeIO):
     Type = str
diff --git a/comfy_extras/v3/nodes_camera_trajectory.py b/comfy_extras/v3/nodes_camera_trajectory.py
new file mode 100644
index 000000000..2891d9854
--- /dev/null
+++ b/comfy_extras/v3/nodes_camera_trajectory.py
@@ -0,0 +1,217 @@
+from __future__ import annotations
+
+import numpy as np
+import torch
+from einops import rearrange
+
+import comfy.model_management
+import nodes
+from comfy_api.v3 import io
+
+CAMERA_DICT = {
+    "base_T_norm": 1.5,
+    "base_angle": np.pi / 3,
+    "Static": {"angle": [0.0, 0.0, 0.0], "T": [0.0, 0.0, 0.0]},
+    "Pan Up": {"angle": [0.0, 0.0, 0.0], "T": [0.0, -1.0, 0.0]},
+    "Pan Down": {"angle": [0.0, 0.0, 0.0], "T": [0.0, 1.0, 0.0]},
+    "Pan Left": {"angle": [0.0, 0.0, 0.0], "T": [-1.0, 0.0, 0.0]},
+    "Pan Right": {"angle": [0.0, 0.0, 0.0], "T": [1.0, 0.0, 0.0]},
+    "Zoom In": {"angle": [0.0, 0.0, 0.0], "T": [0.0, 0.0, 2.0]},
+    "Zoom Out": {"angle": [0.0, 0.0, 0.0], "T": [0.0, 0.0, -2.0]},
+    "Anti Clockwise (ACW)": {"angle": [0.0, 0.0, -1.0], "T": [0.0, 0.0, 0.0]},
+    "ClockWise (CW)": {"angle": [0.0, 0.0, 1.0], "T": [0.0, 0.0, 0.0]},
+}
+
+
+def process_pose_params(cam_params, width=672, height=384, original_pose_width=1280, original_pose_height=720, device="cpu"):
+    def get_relative_pose(cam_params):
+        """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py"""
+        abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
+        abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
+        cam_to_origin = 0
+        target_cam_c2w = np.array([[1, 0, 0, 0], [0, 1, 0, -cam_to_origin], [0, 0, 1, 0], [0, 0, 0, 1]])
+        abs2rel = target_cam_c2w @ abs_w2cs[0]
+        ret_poses = [target_cam_c2w] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
+        return np.array(ret_poses, dtype=np.float32)
+
+    """Modified from https://github.com/hehao13/CameraCtrl/blob/main/inference.py"""
+    cam_params = [Camera(cam_param) for cam_param in cam_params]
+
+    sample_wh_ratio = width / height
+    pose_wh_ratio = original_pose_width / original_pose_height  # defaults assume a 1280x720 source trajectory
+
+    if pose_wh_ratio > sample_wh_ratio:
+        resized_ori_w = height * pose_wh_ratio
+        for cam_param in cam_params:
+            cam_param.fx = resized_ori_w * cam_param.fx / width
+    else:
+        resized_ori_h = width / pose_wh_ratio
+        for cam_param in cam_params:
+            cam_param.fy = resized_ori_h * cam_param.fy / height
+
+    intrinsic = np.asarray(
+        [[cam_param.fx * width, cam_param.fy * height, cam_param.cx * width, cam_param.cy * height] for cam_param in cam_params],
+        dtype=np.float32,
+    )
+
+    K = torch.as_tensor(intrinsic)[None]  # [1, n_frame, 4]
+    c2ws = get_relative_pose(cam_params)  # poses relative to the first frame, via the helper above
+    c2ws = torch.as_tensor(c2ws)[None]  # [1, n_frame, 4, 4]
+    plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[0].permute(0, 3, 1, 2).contiguous()  # V, 6, H, W
+    plucker_embedding = plucker_embedding[None]
+    return rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
+
+
+class Camera:
+    """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py"""
+
+    def __init__(self, entry):
+        fx, fy, cx, cy = entry[1:5]
+        self.fx = fx
+        self.fy = fy
+        self.cx = cx
+        self.cy = cy
+        c2w_mat = np.array(entry[7:]).reshape(4, 4)
+        self.c2w_mat = c2w_mat
+        self.w2c_mat = np.linalg.inv(c2w_mat)
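For reference, each per-frame trajectory entry that `Camera` consumes is laid out as `[pad, fx, fy, cx, cy, 0, 0, <12 values: the 3x4 [R|T] rows>, 0, 0, 0, 1]`, matching how `execute()` assembles the rows further down. A minimal sketch (toy values, not part of the patch):

import numpy as np

# Hypothetical single trajectory row, mirroring what execute() builds below:
# [pad, fx, fy, cx, cy, 0, 0, <12 values of the 3x4 [R|T] rows>, 0, 0, 0, 1]
fx, fy, cx, cy = 0.5, 0.5, 0.5, 0.5
RT = np.concatenate([np.eye(3), np.zeros((3, 1))], axis=1)  # identity pose
entry = [0.0, fx, fy, cx, cy, 0.0, 0.0] + RT.flatten().tolist() + [0.0, 0.0, 0.0, 1.0]

c2w = np.array(entry[7:]).reshape(4, 4)   # exactly what Camera.__init__ reads
w2c = np.linalg.inv(c2w)
assert np.allclose(w2c @ c2w, np.eye(4))  # w2c_mat really is the inverse pose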
+
+
+def ray_condition(K, c2w, H, W, device):
+    """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py"""
+    # c2w: B, V, 4, 4
+    # K: B, V, 4
+
+    B = K.shape[0]
+
+    j, i = torch.meshgrid(
+        torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
+        torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
+        indexing="ij",
+    )
+    i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, 1, HxW]
+    j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, 1, HxW]
+
+    fx, fy, cx, cy = K.chunk(4, dim=-1)  # B, V, 1
+
+    zs = torch.ones_like(i)  # [B, 1, HxW]
+    xs = (i - cx) / fx * zs
+    ys = (j - cy) / fy * zs
+    zs = zs.expand_as(ys)
+
+    directions = torch.stack((xs, ys, zs), dim=-1)  # B, V, HW, 3
+    directions = directions / directions.norm(dim=-1, keepdim=True)  # B, V, HW, 3
+
+    rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2)  # B, V, HW, 3
+    rays_o = c2w[..., :3, 3]  # B, V, 3
+    rays_o = rays_o[:, :, None].expand_as(rays_d)  # B, V, HW, 3
+    # Plücker moment: rays_o x rays_d
+    rays_dxo = torch.cross(rays_o, rays_d)
+    plucker = torch.cat([rays_dxo, rays_d], dim=-1)
+    plucker = plucker.reshape(B, c2w.shape[1], H, W, 6)  # B, V, H, W, 6
+    # plucker = plucker.permute(0, 1, 4, 2, 3)
+    return plucker
+
+
+def get_camera_motion(angle, T, speed, n=81):
+    def compute_R_from_rad_angle(angles):
+        theta_x, theta_y, theta_z = angles
+        Rx = np.array([[1, 0, 0], [0, np.cos(theta_x), -np.sin(theta_x)], [0, np.sin(theta_x), np.cos(theta_x)]])
+
+        Ry = np.array([[np.cos(theta_y), 0, np.sin(theta_y)], [0, 1, 0], [-np.sin(theta_y), 0, np.cos(theta_y)]])
+
+        Rz = np.array([[np.cos(theta_z), -np.sin(theta_z), 0], [np.sin(theta_z), np.cos(theta_z), 0], [0, 0, 1]])
+
+        R = np.dot(Rz, np.dot(Ry, Rx))
+        return R
+
+    RT = []
+    for i in range(n):
+        _angle = (i / n) * speed * (CAMERA_DICT["base_angle"]) * angle
+        R = compute_R_from_rad_angle(_angle)
+        _T = (i / n) * speed * (CAMERA_DICT["base_T_norm"]) * (T.reshape(3, 1))
+        _RT = np.concatenate([R, _T], axis=1)
+        RT.append(_RT)
+    RT = np.stack(RT)
+    return RT
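To make `ray_condition` concrete: for each pixel it back-projects a unit ray direction d through the intrinsics, rotates it into world space, and stores the Plücker pair (o x d, d). A self-contained sketch (toy 2x2 image, identity pose, so the camera origin is zero and the moment vanishes; not part of the patch):

import torch

B, V, H, W = 1, 1, 2, 2
fx = fy = 1.0
cx, cy = W / 2, H / 2

j, i = torch.meshgrid(torch.arange(H).float(), torch.arange(W).float(), indexing="ij")
i = i.reshape(1, 1, H * W) + 0.5
j = j.reshape(1, 1, H * W) + 0.5

# identity camera-to-world: the origin is at 0, so the moment o x d is zero
c2w = torch.eye(4).expand(B, V, 4, 4)

dirs = torch.stack(((i - cx) / fx, (j - cy) / fy, torch.ones_like(i)), dim=-1)
dirs = dirs / dirs.norm(dim=-1, keepdim=True)            # unit directions
rays_d = dirs @ c2w[..., :3, :3].transpose(-1, -2)       # rotate into world space
rays_o = c2w[..., :3, 3][:, :, None].expand_as(rays_d)   # broadcast camera origin
moment = torch.cross(rays_o, rays_d, dim=-1)

plucker = torch.cat([moment, rays_d], dim=-1)            # [B, V, HW, 6]
assert plucker.shape == (B, V, H * W, 6)
assert torch.allclose((moment * rays_d).sum(-1), torch.zeros(B, V, H * W))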
+
+
+class WanCameraEmbedding(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanCameraEmbedding_V3",
+            category="camera",
+            inputs=[
+                io.Combo.Input(
+                    "camera_pose",
+                    options=[
+                        "Static",
+                        "Pan Up",
+                        "Pan Down",
+                        "Pan Left",
+                        "Pan Right",
+                        "Zoom In",
+                        "Zoom Out",
+                        "Anti Clockwise (ACW)",
+                        "ClockWise (CW)",
+                    ],
+                    default="Static",
+                ),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Float.Input("speed", default=1.0, min=0, max=10.0, step=0.1, optional=True),
+                io.Float.Input("fx", default=0.5, min=0, max=1, step=0.000000001, optional=True),
+                io.Float.Input("fy", default=0.5, min=0, max=1, step=0.000000001, optional=True),
+                io.Float.Input("cx", default=0.5, min=0, max=1, step=0.01, optional=True),
+                io.Float.Input("cy", default=0.5, min=0, max=1, step=0.01, optional=True),
+            ],
+            outputs=[
+                io.WanCameraEmbedding.Output(display_name="camera_embedding"),
+                io.Int.Output(display_name="width"),
+                io.Int.Output(display_name="height"),
+                io.Int.Output(display_name="length"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, camera_pose, width, height, length, speed=1.0, fx=0.5, fy=0.5, cx=0.5, cy=0.5) -> io.NodeOutput:
+        """
+        Use camera trajectory as extrinsic parameters to calculate Plücker embeddings (Sitzmann et al., 2021)
+        Adapted from https://github.com/aigc-apps/VideoX-Fun/blob/main/comfyui/comfyui_nodes.py
+        """
+        motion_list = [camera_pose]
+        speed = speed
+        angle = np.array(CAMERA_DICT[motion_list[0]]["angle"])
+        T = np.array(CAMERA_DICT[motion_list[0]]["T"])
+        RT = get_camera_motion(angle, T, speed, length)
+
+        trajs = []
+        for cp in RT.tolist():
+            traj = [fx, fy, cx, cy, 0, 0]
+            traj.extend(cp[0])
+            traj.extend(cp[1])
+            traj.extend(cp[2])
+            traj.extend([0, 0, 0, 1])
+            trajs.append(traj)
+
+        cam_params = np.array([[float(x) for x in pose] for pose in trajs])
+        cam_params = np.concatenate([np.zeros_like(cam_params[:, :1]), cam_params], 1)
+        control_camera_video = process_pose_params(cam_params, width=width, height=height)
+        control_camera_video = control_camera_video.permute([3, 0, 1, 2]).unsqueeze(0).to(device=comfy.model_management.intermediate_device())
+
+        control_camera_video = torch.concat(
+            [torch.repeat_interleave(control_camera_video[:, :, 0:1], repeats=4, dim=2), control_camera_video[:, :, 1:]], dim=2
+        ).transpose(1, 2)
+
+        # Reshape, transpose, and view into the packed (c * 4)-channel layout
+        b, f, c, h, w = control_camera_video.shape
+        control_camera_video = control_camera_video.contiguous().view(b, f // 4, 4, c, h, w).transpose(2, 3)
+        control_camera_video = control_camera_video.contiguous().view(b, f // 4, c * 4, h, w).transpose(1, 2)
+
+        return io.NodeOutput(control_camera_video, width, height, length)
+
+
+NODES_LIST = [
+    WanCameraEmbedding,
+]
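The tail of `execute()` packs frames into channels to match Wan's 4x temporal compression: the first frame is repeated four times, then every group of four frames is folded into the channel axis. A toy shape walk-through (random stand-in tensor, small H and W; not part of the patch):

import torch

# Stand-in for the [1, 6, F, H, W] Plücker volume (F=81 frames, tiny H, W)
x = torch.randn(1, 6, 81, 4, 4)

# Repeat the first frame 4x so the frame count becomes divisible by 4 (81 -> 84)
x = torch.concat([x[:, :, 0:1].repeat_interleave(4, dim=2), x[:, :, 1:]], dim=2).transpose(1, 2)

b, f, c, h, w = x.shape                                    # [1, 84, 6, 4, 4]
x = x.contiguous().view(b, f // 4, 4, c, h, w).transpose(2, 3)
x = x.contiguous().view(b, f // 4, c * 4, h, w).transpose(1, 2)

print(x.shape)  # torch.Size([1, 24, 21, 4, 4]) -> [B, 6*4 channels, F//4, H, W]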
diff --git a/comfy_extras/v3/nodes_canny.py b/comfy_extras/v3/nodes_canny.py
new file mode 100644
index 000000000..80f14e7c8
--- /dev/null
+++ b/comfy_extras/v3/nodes_canny.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+from kornia.filters import canny
+
+import comfy.model_management
+from comfy_api.v3 import io
+
+
+class Canny(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="Canny_V3",
+            category="image/preprocessors",
+            inputs=[
+                io.Image.Input("image"),
+                io.Float.Input("low_threshold", default=0.4, min=0.01, max=0.99, step=0.01),
+                io.Float.Input("high_threshold", default=0.8, min=0.01, max=0.99, step=0.01),
+            ],
+            outputs=[io.Image.Output()],
+        )
+
+    @classmethod
+    def execute(cls, image, low_threshold, high_threshold) -> io.NodeOutput:
+        output = canny(image.to(comfy.model_management.get_torch_device()).movedim(-1, 1), low_threshold, high_threshold)
+        img_out = output[1].to(comfy.model_management.intermediate_device()).repeat(1, 3, 1, 1).movedim(1, -1)
+        return io.NodeOutput(img_out)
+
+
+NODES_LIST = [
+    Canny,
+]
diff --git a/comfy_extras/v3/nodes_cfg.py b/comfy_extras/v3/nodes_cfg.py
new file mode 100644
index 000000000..5c06c271a
--- /dev/null
+++ b/comfy_extras/v3/nodes_cfg.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import torch
+
+from comfy_api.v3 import io
+
+
+def optimized_scale(positive, negative):
+    positive_flat = positive.reshape(positive.shape[0], -1)
+    negative_flat = negative.reshape(negative.shape[0], -1)
+
+    # Calculate dot product
+    dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
+
+    # Squared norm of the unconditional prediction
+    squared_norm = torch.sum(negative_flat ** 2, dim=1, keepdim=True) + 1e-8
+
+    # st_star = v_cond^T * v_uncond / ||v_uncond||^2
+    st_star = dot_product / squared_norm
+
+    return st_star.reshape([positive.shape[0]] + [1] * (positive.ndim - 1))
+
+
+class CFGNorm(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls) -> io.SchemaV3:
+        return io.SchemaV3(
+            node_id="CFGNorm_V3",
+            category="advanced/guidance",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01),
+            ],
+            outputs=[io.Model.Output("patched_model", display_name="patched_model")],
+        )
+
+    @classmethod
+    def execute(cls, model, strength) -> io.NodeOutput:
+        m = model.clone()
+
+        def cfg_norm(args):
+            cond_p = args['cond_denoised']
+            pred_text_ = args["denoised"]
+
+            norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True)
+            norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True)
+            scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0)
+            return pred_text_ * scale * strength
+
+        m.set_model_sampler_post_cfg_function(cfg_norm)
+        return io.NodeOutput(m)
+
+
+class CFGZeroStar(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls) -> io.SchemaV3:
+        return io.SchemaV3(
+            node_id="CFGZeroStar_V3",
+            category="advanced/guidance",
+            inputs=[
+                io.Model.Input("model"),
+            ],
+            outputs=[io.Model.Output("patched_model", display_name="patched_model")],
+        )
+
+    @classmethod
+    def execute(cls, model) -> io.NodeOutput:
+        m = model.clone()
+
+        def cfg_zero_star(args):
+            guidance_scale = args['cond_scale']
+            x = args['input']
+            cond_p = args['cond_denoised']
+            uncond_p = args['uncond_denoised']
+            out = args["denoised"]
+            alpha = optimized_scale(x - cond_p, x - uncond_p)
+
+            return out + uncond_p * (alpha - 1.0) + guidance_scale * uncond_p * (1.0 - alpha)
+
+        m.set_model_sampler_post_cfg_function(cfg_zero_star)
+        return io.NodeOutput(m)
+
+
+NODES_LIST = [
+    CFGNorm,
+    CFGZeroStar,
+]
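`optimized_scale` computes a per-sample least-squares coefficient, st* = <v_cond, v_uncond> / ||v_uncond||^2, i.e. the scalar that best projects the conditional direction onto the unconditional one. A quick standalone check (the function is restated so the snippet runs on its own): if `positive` is exactly 2.5x `negative`, the scale comes back as 2.5.

import torch

def optimized_scale(positive, negative):
    positive_flat = positive.reshape(positive.shape[0], -1)
    negative_flat = negative.reshape(negative.shape[0], -1)
    dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
    squared_norm = torch.sum(negative_flat ** 2, dim=1, keepdim=True) + 1e-8
    return (dot_product / squared_norm).reshape([positive.shape[0]] + [1] * (positive.ndim - 1))

neg = torch.randn(2, 4, 8, 8)
pos = 2.5 * neg                 # positive is exactly 2.5x the negative direction
scale = optimized_scale(pos, neg)
print(scale.shape)              # torch.Size([2, 1, 1, 1]) -> one scalar per batch item
print(scale.flatten())          # ~tensor([2.5000, 2.5000])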
node_id="CLIPTextEncodeSDXLRefiner_V3", + category="advanced/conditioning", + inputs=[ + io.Float.Input("ascore", default=6.0, min=0.0, max=1000.0, step=0.01), + io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("height", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.String.Input("text", multiline=True, dynamic_prompts=True), + io.Clip.Input("clip"), + ], + outputs=[io.Conditioning.Output()], + ) + + @classmethod + def execute(cls, ascore, width, height, text, clip) -> io.NodeOutput: + tokens = clip.tokenize(text) + conditioning = clip.encode_from_tokens_scheduled( + tokens, add_dict={"aesthetic_score": ascore, "width": width, "height": height} + ) + return io.NodeOutput(conditioning) + + +NODES_LIST = [ + CLIPTextEncodeSDXL, + CLIPTextEncodeSDXLRefiner, +] diff --git a/comfy_extras/v3/nodes_compositing.py b/comfy_extras/v3/nodes_compositing.py new file mode 100644 index 000000000..934530bcf --- /dev/null +++ b/comfy_extras/v3/nodes_compositing.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +from enum import Enum + +import torch + +import comfy.utils +from comfy_api.v3 import io + + +def resize_mask(mask, shape): + return torch.nn.functional.interpolate( + mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(shape[0], shape[1]), mode="bilinear" + ).squeeze(1) + + +class PorterDuffMode(Enum): + ADD = 0 + CLEAR = 1 + DARKEN = 2 + DST = 3 + DST_ATOP = 4 + DST_IN = 5 + DST_OUT = 6 + DST_OVER = 7 + LIGHTEN = 8 + MULTIPLY = 9 + OVERLAY = 10 + SCREEN = 11 + SRC = 12 + SRC_ATOP = 13 + SRC_IN = 14 + SRC_OUT = 15 + SRC_OVER = 16 + XOR = 17 + + +def porter_duff_composite( + src_image: torch.Tensor, src_alpha: torch.Tensor, dst_image: torch.Tensor, dst_alpha: torch.Tensor, mode: PorterDuffMode +): + # convert mask to alpha + src_alpha = 1 - src_alpha + dst_alpha = 1 - dst_alpha + # premultiply alpha + src_image = src_image * src_alpha + dst_image = dst_image * dst_alpha + + # composite ops below assume alpha-premultiplied images + if mode == PorterDuffMode.ADD: + out_alpha = torch.clamp(src_alpha + dst_alpha, 0, 1) + out_image = torch.clamp(src_image + dst_image, 0, 1) + elif mode == PorterDuffMode.CLEAR: + out_alpha = torch.zeros_like(dst_alpha) + out_image = torch.zeros_like(dst_image) + elif mode == PorterDuffMode.DARKEN: + out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha + out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image + torch.min(src_image, dst_image) + elif mode == PorterDuffMode.DST: + out_alpha = dst_alpha + out_image = dst_image + elif mode == PorterDuffMode.DST_ATOP: + out_alpha = src_alpha + out_image = src_alpha * dst_image + (1 - dst_alpha) * src_image + elif mode == PorterDuffMode.DST_IN: + out_alpha = src_alpha * dst_alpha + out_image = dst_image * src_alpha + elif mode == PorterDuffMode.DST_OUT: + out_alpha = (1 - src_alpha) * dst_alpha + out_image = (1 - src_alpha) * dst_image + elif mode == PorterDuffMode.DST_OVER: + out_alpha = dst_alpha + (1 - dst_alpha) * src_alpha + out_image = dst_image + (1 - dst_alpha) * src_image + elif mode == PorterDuffMode.LIGHTEN: + out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha + out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image + torch.max(src_image, dst_image) + elif mode == PorterDuffMode.MULTIPLY: + out_alpha = src_alpha * dst_alpha + out_image = src_image * dst_image + elif mode == PorterDuffMode.OVERLAY: + out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha + out_image = torch.where(2 * dst_image < dst_alpha, 2 * 
diff --git a/comfy_extras/v3/nodes_compositing.py b/comfy_extras/v3/nodes_compositing.py
new file mode 100644
index 000000000..934530bcf
--- /dev/null
+++ b/comfy_extras/v3/nodes_compositing.py
@@ -0,0 +1,226 @@
+from __future__ import annotations
+
+from enum import Enum
+
+import torch
+
+import comfy.utils
+from comfy_api.v3 import io
+
+
+def resize_mask(mask, shape):
+    return torch.nn.functional.interpolate(
+        mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(shape[0], shape[1]), mode="bilinear"
+    ).squeeze(1)
+
+
+class PorterDuffMode(Enum):
+    ADD = 0
+    CLEAR = 1
+    DARKEN = 2
+    DST = 3
+    DST_ATOP = 4
+    DST_IN = 5
+    DST_OUT = 6
+    DST_OVER = 7
+    LIGHTEN = 8
+    MULTIPLY = 9
+    OVERLAY = 10
+    SCREEN = 11
+    SRC = 12
+    SRC_ATOP = 13
+    SRC_IN = 14
+    SRC_OUT = 15
+    SRC_OVER = 16
+    XOR = 17
+
+
+def porter_duff_composite(
+    src_image: torch.Tensor, src_alpha: torch.Tensor, dst_image: torch.Tensor, dst_alpha: torch.Tensor, mode: PorterDuffMode
+):
+    # convert mask to alpha
+    src_alpha = 1 - src_alpha
+    dst_alpha = 1 - dst_alpha
+    # premultiply alpha
+    src_image = src_image * src_alpha
+    dst_image = dst_image * dst_alpha
+
+    # composite ops below assume alpha-premultiplied images
+    if mode == PorterDuffMode.ADD:
+        out_alpha = torch.clamp(src_alpha + dst_alpha, 0, 1)
+        out_image = torch.clamp(src_image + dst_image, 0, 1)
+    elif mode == PorterDuffMode.CLEAR:
+        out_alpha = torch.zeros_like(dst_alpha)
+        out_image = torch.zeros_like(dst_image)
+    elif mode == PorterDuffMode.DARKEN:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image + torch.min(src_image, dst_image)
+    elif mode == PorterDuffMode.DST:
+        out_alpha = dst_alpha
+        out_image = dst_image
+    elif mode == PorterDuffMode.DST_ATOP:
+        out_alpha = src_alpha
+        out_image = src_alpha * dst_image + (1 - dst_alpha) * src_image
+    elif mode == PorterDuffMode.DST_IN:
+        out_alpha = src_alpha * dst_alpha
+        out_image = dst_image * src_alpha
+    elif mode == PorterDuffMode.DST_OUT:
+        out_alpha = (1 - src_alpha) * dst_alpha
+        out_image = (1 - src_alpha) * dst_image
+    elif mode == PorterDuffMode.DST_OVER:
+        out_alpha = dst_alpha + (1 - dst_alpha) * src_alpha
+        out_image = dst_image + (1 - dst_alpha) * src_image
+    elif mode == PorterDuffMode.LIGHTEN:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image + torch.max(src_image, dst_image)
+    elif mode == PorterDuffMode.MULTIPLY:
+        out_alpha = src_alpha * dst_alpha
+        out_image = src_image * dst_image
+    elif mode == PorterDuffMode.OVERLAY:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = torch.where(2 * dst_image < dst_alpha, 2 * src_image * dst_image,
+                                src_alpha * dst_alpha - 2 * (dst_alpha - src_image) * (src_alpha - dst_image))
+    elif mode == PorterDuffMode.SCREEN:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = src_image + dst_image - src_image * dst_image
+    elif mode == PorterDuffMode.SRC:
+        out_alpha = src_alpha
+        out_image = src_image
+    elif mode == PorterDuffMode.SRC_ATOP:
+        out_alpha = dst_alpha
+        out_image = dst_alpha * src_image + (1 - src_alpha) * dst_image
+    elif mode == PorterDuffMode.SRC_IN:
+        out_alpha = src_alpha * dst_alpha
+        out_image = src_image * dst_alpha
+    elif mode == PorterDuffMode.SRC_OUT:
+        out_alpha = (1 - dst_alpha) * src_alpha
+        out_image = (1 - dst_alpha) * src_image
+    elif mode == PorterDuffMode.SRC_OVER:
+        out_alpha = src_alpha + (1 - src_alpha) * dst_alpha
+        out_image = src_image + (1 - src_alpha) * dst_image
+    elif mode == PorterDuffMode.XOR:
+        out_alpha = (1 - dst_alpha) * src_alpha + (1 - src_alpha) * dst_alpha
+        out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image
+    else:
+        return None, None
+
+    # back to non-premultiplied alpha
+    out_image = torch.where(out_alpha > 1e-5, out_image / out_alpha, torch.zeros_like(out_image))
+    out_image = torch.clamp(out_image, 0, 1)
+    # convert alpha to mask
+    out_alpha = 1 - out_alpha
+    return out_image, out_alpha
+
+
+class JoinImageWithAlpha(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="JoinImageWithAlpha_V3",
+            display_name="Join Image with Alpha _V3",
+            category="mask/compositing",
+            inputs=[
+                io.Image.Input("image"),
+                io.Mask.Input("alpha"),
+            ],
+            outputs=[io.Image.Output()],
+        )
+
+    @classmethod
+    def execute(cls, image: torch.Tensor, alpha: torch.Tensor) -> io.NodeOutput:
+        batch_size = min(len(image), len(alpha))
+        out_images = []
+
+        alpha = 1.0 - resize_mask(alpha, image.shape[1:])
+        for i in range(batch_size):
+            out_images.append(torch.cat((image[i][:, :, :3], alpha[i].unsqueeze(2)), dim=2))
+
+        return io.NodeOutput(torch.stack(out_images))
+
+
+class PorterDuffImageComposite(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="PorterDuffImageComposite_V3",
+            display_name="Porter-Duff Image Composite _V3",
+            category="mask/compositing",
+            inputs=[
+                io.Image.Input("source"),
+                io.Mask.Input("source_alpha"),
+                io.Image.Input("destination"),
+                io.Mask.Input("destination_alpha"),
+                io.Combo.Input("mode", options=[mode.name for mode in PorterDuffMode], default=PorterDuffMode.DST.name),
+            ],
+            outputs=[io.Image.Output(), io.Mask.Output()],
+        )
+
+    @classmethod
+    def execute(
+        cls, source: torch.Tensor, source_alpha: torch.Tensor, destination: torch.Tensor, destination_alpha: torch.Tensor, mode
+    ) -> io.NodeOutput:
+        batch_size = min(len(source), len(source_alpha), len(destination), len(destination_alpha))
+        out_images = []
+        out_alphas = []
+
+        for i in range(batch_size):
+            src_image = source[i]
+            dst_image = destination[i]
+
+            assert src_image.shape[2] == dst_image.shape[2]  # inputs need to have same number of channels
+
+            src_alpha = source_alpha[i].unsqueeze(2)
+            dst_alpha = destination_alpha[i].unsqueeze(2)
+
+            if dst_alpha.shape[:2] != dst_image.shape[:2]:
+                upscale_input = dst_alpha.unsqueeze(0).permute(0, 3, 1, 2)
+                upscale_output = comfy.utils.common_upscale(
+                    upscale_input, dst_image.shape[1], dst_image.shape[0], upscale_method='bicubic', crop='center'
+                )
+                dst_alpha = upscale_output.permute(0, 2, 3, 1).squeeze(0)
+            if src_image.shape != dst_image.shape:
+                upscale_input = src_image.unsqueeze(0).permute(0, 3, 1, 2)
+                upscale_output = comfy.utils.common_upscale(
+                    upscale_input, dst_image.shape[1], dst_image.shape[0], upscale_method='bicubic', crop='center'
+                )
+                src_image = upscale_output.permute(0, 2, 3, 1).squeeze(0)
+            if src_alpha.shape != dst_alpha.shape:
+                upscale_input = src_alpha.unsqueeze(0).permute(0, 3, 1, 2)
+                upscale_output = comfy.utils.common_upscale(
+                    upscale_input, dst_alpha.shape[1], dst_alpha.shape[0], upscale_method='bicubic', crop='center'
+                )
+                src_alpha = upscale_output.permute(0, 2, 3, 1).squeeze(0)
+
+            out_image, out_alpha = porter_duff_composite(src_image, src_alpha, dst_image, dst_alpha, PorterDuffMode[mode])
+
+            out_images.append(out_image)
+            out_alphas.append(out_alpha.squeeze(2))
+
+        return io.NodeOutput(torch.stack(out_images), torch.stack(out_alphas))
+
+
+class SplitImageWithAlpha(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="SplitImageWithAlpha_V3",
+            display_name="Split Image with Alpha _V3",
+            category="mask/compositing",
+            inputs=[
+                io.Image.Input("image"),
+            ],
+            outputs=[io.Image.Output(), io.Mask.Output()],
+        )
+
+    @classmethod
+    def execute(cls, image: torch.Tensor) -> io.NodeOutput:
+        out_images = [i[:, :, :3] for i in image]
+        out_alphas = [i[:, :, 3] if i.shape[2] > 3 else torch.ones_like(i[:, :, 0]) for i in image]
+        return io.NodeOutput(torch.stack(out_images), 1.0 - torch.stack(out_alphas))
+
+
+NODES_LIST = [
+    JoinImageWithAlpha,
+    PorterDuffImageComposite,
+    SplitImageWithAlpha,
+]
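A numeric sanity check of the mask convention in `porter_duff_composite` (assuming the definitions from `nodes_compositing.py` above are in scope): inputs and outputs are masks, i.e. inverted alphas, so a mask of 0.0 means fully opaque. Compositing a half-transparent source over an opaque destination with SRC_OVER:

import torch

# Toy 1x1 "images": half-transparent source over opaque destination.
src = torch.full((1, 1, 3), 0.8)        # source color
dst = torch.full((1, 1, 3), 0.2)        # destination color
src_mask = torch.full((1, 1, 1), 0.5)   # mask 0.5 -> alpha 0.5
dst_mask = torch.zeros((1, 1, 1))       # mask 0.0 -> alpha 1.0 (opaque)

out_image, out_mask = porter_duff_composite(src, src_mask, dst, dst_mask, PorterDuffMode.SRC_OVER)
# alpha_out = 0.5 + (1 - 0.5) * 1.0 = 1.0 -> mask 0.0
# color_out = (0.5 * 0.8 + 0.5 * 0.2) / 1.0 = 0.5
print(out_mask.flatten())   # tensor([0.])
print(out_image.flatten())  # tensor([0.5000, 0.5000, 0.5000])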
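Under the same inverted-mask convention, `SplitImageWithAlpha` and `JoinImageWithAlpha` are designed to round-trip. A toy check with the two execute bodies inlined (`resize_mask` omitted since the sizes already match):

import torch

# Toy RGBA batch [B, H, W, 4]
rgba = torch.rand(1, 8, 8, 4)

# SplitImageWithAlpha.execute, inlined: returns RGB and an inverted-alpha mask
rgb = rgba[..., :3]
mask = 1.0 - rgba[..., 3]

# JoinImageWithAlpha.execute, inlined: re-inverts the mask back into an alpha channel
alpha = 1.0 - mask
rejoined = torch.cat((rgb, alpha.unsqueeze(-1)), dim=-1)

assert torch.allclose(rejoined, rgba)   # split -> join recovers the original image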
diff --git a/comfy_extras/v3/nodes_cond.py b/comfy_extras/v3/nodes_cond.py
new file mode 100644
index 000000000..9e7d2e477
--- /dev/null
+++ b/comfy_extras/v3/nodes_cond.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from comfy_api.v3 import io
+
+
+class CLIPTextEncodeControlnet(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls) -> io.SchemaV3:
+        return io.SchemaV3(
+            node_id="CLIPTextEncodeControlnet_V3",
+            category="_for_testing/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.Conditioning.Input("conditioning"),
+                io.String.Input("text", multiline=True, dynamic_prompts=True),
+            ],
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, clip, conditioning, text) -> io.NodeOutput:
+        tokens = clip.tokenize(text)
+        cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
+        c = []
+        for t in conditioning:
+            n = [t[0], t[1].copy()]
+            n[1]['cross_attn_controlnet'] = cond
+            n[1]['pooled_output_controlnet'] = pooled
+            c.append(n)
+        return io.NodeOutput(c)
+
+
+class T5TokenizerOptions(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls) -> io.SchemaV3:
+        return io.SchemaV3(
+            node_id="T5TokenizerOptions_V3",
+            category="_for_testing/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.Int.Input("min_padding", default=0, min=0, max=10000, step=1),
+                io.Int.Input("min_length", default=0, min=0, max=10000, step=1),
+            ],
+            outputs=[io.Clip.Output()],
+        )
+
+    @classmethod
+    def execute(cls, clip, min_padding, min_length) -> io.NodeOutput:
+        clip = clip.clone()
+        for t5_type in ["t5xxl", "pile_t5xl", "t5base", "mt5xl", "umt5xxl"]:
+            clip.set_tokenizer_option("{}_min_padding".format(t5_type), min_padding)
+            clip.set_tokenizer_option("{}_min_length".format(t5_type), min_length)
+
+        return io.NodeOutput(clip)
+
+
+NODES_LIST = [
+    CLIPTextEncodeControlnet,
+    T5TokenizerOptions,
+]
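Conditioning objects here are lists of `[tensor, options_dict]` pairs; `CLIPTextEncodeControlnet` copies each dict before adding its two keys, so the caller's conditioning is not mutated. A sketch with stand-in tensors (the 77 and 768 dimensions are illustrative only, not required by the node):

import torch

conditioning = [[torch.zeros(1, 77, 768), {"pooled_output": torch.zeros(1, 768)}]]
cond = torch.zeros(1, 77, 768)      # stand-in for the encode_from_tokens output
pooled = torch.zeros(1, 768)

c = []
for t in conditioning:
    n = [t[0], t[1].copy()]         # shallow-copy the options dict
    n[1]['cross_attn_controlnet'] = cond
    n[1]['pooled_output_controlnet'] = pooled
    c.append(n)

assert 'cross_attn_controlnet' not in conditioning[0][1]   # original left untouched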
diff --git a/comfy_extras/v3/nodes_cosmos.py b/comfy_extras/v3/nodes_cosmos.py
new file mode 100644
index 000000000..0659390c7
--- /dev/null
+++ b/comfy_extras/v3/nodes_cosmos.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+import torch
+
+import comfy.latent_formats
+import comfy.model_management
+import comfy.utils
+import nodes
+from comfy_api.v3 import io
+
+
+def vae_encode_with_padding(vae, image, width, height, length, padding=0):
+    pixels = comfy.utils.common_upscale(image[..., :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+    pixel_len = min(pixels.shape[0], length)
+    padded_length = min(length, (((pixel_len - 1) // 8) + 1 + padding) * 8 - 7)
+    padded_pixels = torch.ones((padded_length, height, width, 3)) * 0.5
+    padded_pixels[:pixel_len] = pixels[:pixel_len]
+    latent_len = ((pixel_len - 1) // 8) + 1
+    latent_temp = vae.encode(padded_pixels)
+    return latent_temp[:, :, :latent_len]
+
+
+class CosmosImageToVideoLatent(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls) -> io.SchemaV3:
+        return io.SchemaV3(
+            node_id="CosmosImageToVideoLatent_V3",
+            category="conditioning/inpaint",
+            inputs=[
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("start_image", optional=True),
+                io.Image.Input("end_image", optional=True),
+            ],
+            outputs=[io.Latent.Output()],
+        )
+
+    @classmethod
+    def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput:
+        latent = torch.zeros([1, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        if start_image is None and end_image is None:
+            out_latent = {}
+            out_latent["samples"] = latent
+            return io.NodeOutput(out_latent)
+
+        mask = torch.ones(
+            [latent.shape[0], 1, ((length - 1) // 8) + 1, latent.shape[-2], latent.shape[-1]],
+            device=comfy.model_management.intermediate_device(),
+        )
+
+        if start_image is not None:
+            latent_temp = vae_encode_with_padding(vae, start_image, width, height, length, padding=1)
+            latent[:, :, :latent_temp.shape[-3]] = latent_temp
+            mask[:, :, :latent_temp.shape[-3]] *= 0.0
+
+        if end_image is not None:
+            latent_temp = vae_encode_with_padding(vae, end_image, width, height, length, padding=0)
+            latent[:, :, -latent_temp.shape[-3]:] = latent_temp
+            mask[:, :, -latent_temp.shape[-3]:] *= 0.0
+
+        out_latent = {}
+        out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
+        out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
+        return io.NodeOutput(out_latent)
+
+
+class CosmosPredict2ImageToVideoLatent(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls) -> io.SchemaV3:
+        return io.SchemaV3(
+            node_id="CosmosPredict2ImageToVideoLatent_V3",
+            category="conditioning/inpaint",
+            inputs=[
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=93, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("start_image", optional=True),
+                io.Image.Input("end_image", optional=True),
+            ],
+            outputs=[io.Latent.Output()],
+        )
+
+    @classmethod
+    def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput:
+        latent = torch.zeros([1, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        if start_image is None and end_image is None:
+            out_latent = {}
+            out_latent["samples"] = latent
+            return io.NodeOutput(out_latent)
+
+        mask = torch.ones(
+            [latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]],
+            device=comfy.model_management.intermediate_device(),
+        )
+
+        if start_image is not None:
+            latent_temp = vae_encode_with_padding(vae, start_image, width, height, length, padding=1)
+            latent[:, :, :latent_temp.shape[-3]] = latent_temp
+            mask[:, :, :latent_temp.shape[-3]] *= 0.0
+
+        if end_image is not None:
+            latent_temp = vae_encode_with_padding(vae, end_image, width, height, length, padding=0)
+            latent[:, :, -latent_temp.shape[-3]:] = latent_temp
+            mask[:, :, -latent_temp.shape[-3]:] *= 0.0
+
+        out_latent = {}
+        latent_format = comfy.latent_formats.Wan21()
+        latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask)
+        out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
+        out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
+        return io.NodeOutput(out_latent)
+
+
+class EmptyCosmosLatentVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls) -> io.SchemaV3:
+        return io.SchemaV3(
+            node_id="EmptyCosmosLatentVideo_V3",
+            category="latent/video",
+            inputs=[
+                io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[io.Latent.Output()],
+        )
+
+    @classmethod
+    def execute(cls, width, height, length, batch_size) -> io.NodeOutput:
+        latent = torch.zeros(
+            [batch_size, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()
+        )
+        return io.NodeOutput({"samples": latent})
+
+
+NODES_LIST = [
+    CosmosImageToVideoLatent,
+    CosmosPredict2ImageToVideoLatent,
+    EmptyCosmosLatentVideo,
+]
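The Cosmos nodes convert pixel frame counts to latent frame counts as ((length - 1) // compression) + 1, with 8x temporal compression for Cosmos and 4x for Predict2; `vae_encode_with_padding` rounds a short clip up to a whole compression window before encoding, then trims the latent back. The arithmetic in isolation (pure Python, illustrative):

# Frame-count arithmetic used by the Cosmos nodes above.
def latent_frames(length, compression):
    return ((length - 1) // compression) + 1

print(latent_frames(121, 8))   # 16 temporal latents for CosmosImageToVideoLatent
print(latent_frames(93, 4))    # 24 for CosmosPredict2ImageToVideoLatent

# vae_encode_with_padding: pad a single start frame up to one 8x window
pixel_len, padding = 1, 1
padded_length = (((pixel_len - 1) // 8) + 1 + padding) * 8 - 7
print(padded_length)           # 9 pixel frames; the encode is then trimmed to 1 latent frame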
diff --git a/nodes.py b/nodes.py
index 16a8e571e..fc813f3a2 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2302,10 +2302,17 @@ def init_builtin_extra_nodes():
         "v3/nodes_ace.py",
         "v3/nodes_advanced_samplers.py",
         "v3/nodes_align_your_steps.py",
-        "v3/nodes_audio.py",
         "v3/nodes_apg.py",
         "v3/nodes_attention_multiply.py",
+        "v3/nodes_audio.py",
+        "v3/nodes_camera_trajectory.py",
+        "v3/nodes_canny.py",
+        "v3/nodes_cfg.py",
+        "v3/nodes_clip_sdxl.py",
+        "v3/nodes_compositing.py",
+        "v3/nodes_cond.py",
         "v3/nodes_controlnet.py",
+        "v3/nodes_cosmos.py",
         "v3/nodes_images.py",
         "v3/nodes_mask.py",
         "v3/nodes_preview_any.py",