From 36e827772428cf85ba6c5bb5af75ea5f6bacc60b Mon Sep 17 00:00:00 2001
From: bigcat88
Date: Sat, 19 Jul 2025 07:47:09 +0300
Subject: [PATCH 1/3] v3: converted nodes_video

---
 comfy_api/v3/ui.py             |   8 ++
 comfy_extras/v3/nodes_video.py | 210 +++++++++++++++++++++++++++++++++
 nodes.py                       |   1 +
 3 files changed, 219 insertions(+)
 create mode 100644 comfy_extras/v3/nodes_video.py

diff --git a/comfy_api/v3/ui.py b/comfy_api/v3/ui.py
index 390b986d4..8c74ad8b9 100644
--- a/comfy_api/v3/ui.py
+++ b/comfy_api/v3/ui.py
@@ -230,6 +230,14 @@ class PreviewAudio(_UIOutput):
         return {"audio": self.values}


+class PreviewVideo(_UIOutput):
+    def __init__(self, values: list[SavedResult | dict], **kwargs):
+        self.values = values
+
+    def as_dict(self):
+        return {"images": self.values, "animated": (True,)}
+
+
 class PreviewUI3D(_UIOutput):
     def __init__(self, values: list[SavedResult | dict], **kwargs):
         self.values = values
diff --git a/comfy_extras/v3/nodes_video.py b/comfy_extras/v3/nodes_video.py
new file mode 100644
index 000000000..87cbe55fa
--- /dev/null
+++ b/comfy_extras/v3/nodes_video.py
@@ -0,0 +1,210 @@
+from __future__ import annotations
+
+import json
+import os
+from fractions import Fraction
+
+import av
+import torch
+
+import folder_paths
+from comfy.cli_args import args
+from comfy_api.input import AudioInput, ImageInput, VideoInput
+from comfy_api.input_impl import VideoFromComponents, VideoFromFile
+from comfy_api.util import VideoCodec, VideoComponents, VideoContainer
+from comfy_api.v3 import io, ui
+
+
+class CreateVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="CreateVideo_V3",
+            display_name="Create Video _V3",
+            category="image/video",
+            description="Create a video from images.",
+            inputs=[
+                io.Image.Input("images", tooltip="The images to create a video from."),
+                io.Float.Input("fps", default=30.0, min=1.0, max=120.0, step=1.0),
+                io.Audio.Input("audio", optional=True, tooltip="The audio to add to the video."),
+            ],
+            outputs=[
+                io.Video.Output("video"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, images: ImageInput, fps: float, audio: AudioInput = None):
+        return io.NodeOutput(VideoFromComponents(
+            VideoComponents(
+                images=images,
+                audio=audio,
+                frame_rate=Fraction(fps),
+            )
+        ))
+
+
+class GetVideoComponents(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="GetVideoComponents_V3",
+            display_name="Get Video Components _V3",
+            category="image/video",
+            description="Extracts all components from a video: frames, audio, and framerate.",
+            inputs=[
+                io.Video.Input("video", tooltip="The video to extract components from."),
+            ],
+            outputs=[
+                io.Image.Output("images"),
+                io.Audio.Output("audio"),
+                io.Float.Output("fps"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, video: VideoInput):
+        components = video.get_components()
+        return io.NodeOutput(components.images, components.audio, float(components.frame_rate))
+
+
+class LoadVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        input_dir = folder_paths.get_input_directory()
+        files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
+        files = folder_paths.filter_files_content_types(files, ["video"])
+        return io.SchemaV3(
+            node_id="LoadVideo_V3",
+            display_name="Load Video _V3",
+            category="image/video",
+            inputs=[
+                io.Combo.Input("file", options=sorted(files), upload=io.UploadType.video),
+            ],
+            outputs=[
+                io.Video.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, file):
+        video_path = folder_paths.get_annotated_filepath(file)
+        return io.NodeOutput(VideoFromFile(video_path))
+
+    @classmethod
+    def fingerprint_inputs(cls, file):
+        video_path = folder_paths.get_annotated_filepath(file)
+        mod_time = os.path.getmtime(video_path)
+        # Instead of hashing the file, we can just use the modification time to avoid rehashing large files.
+        return mod_time
+
+    @classmethod
+    def validate_inputs(cls, file):
+        if not folder_paths.exists_annotated_filepath(file):
+            return "Invalid video file: {}".format(file)
+        return True
+
+
+class SaveVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="SaveVideo_V3",
+            display_name="Save Video _V3",
+            category="image/video",
+            description="Saves the input video to your ComfyUI output directory.",
+            inputs=[
+                io.Video.Input("video", tooltip="The video to save."),
+                io.String.Input("filename_prefix", default="video/ComfyUI", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."),
+                io.Combo.Input("format", options=VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."),
+                io.Combo.Input("codec", options=VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."),
+            ],
+            outputs=[],
+            hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo],
+            is_output_node=True,
+        )
+
+    @classmethod
+    def execute(cls, video: VideoInput, filename_prefix, format, codec):
+        width, height = video.get_dimensions()
+        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
+            filename_prefix,
+            folder_paths.get_output_directory(),
+            width,
+            height
+        )
+        saved_metadata = None
+        if not args.disable_metadata:
+            metadata = {}
+            if cls.hidden.extra_pnginfo is not None:
+                metadata.update(cls.hidden.extra_pnginfo)
+            if cls.hidden.prompt is not None:
+                metadata["prompt"] = cls.hidden.prompt
+            if len(metadata) > 0:
+                saved_metadata = metadata
+        file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}"
+        video.save_to(
+            os.path.join(full_output_folder, file),
+            format=format,
+            codec=codec,
+            metadata=saved_metadata
+        )
+        return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)]))
+
+
+class SaveWEBM(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="SaveWEBM_V3",
+            category="image/video",
+            is_experimental=True,
+            inputs=[
+                io.Image.Input("images"),
+                io.String.Input("filename_prefix", default="ComfyUI"),
+                io.Combo.Input("codec", options=["vp9", "av1"]),
+                io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01),
+                io.Float.Input("crf", default=32.0, min=0, max=63.0, step=1, tooltip="Higher crf means lower quality with a smaller file size; lower crf means higher quality with a larger file size."),
+            ],
+            outputs=[],
+            hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo],
+            is_output_node=True,
+        )
+
+    @classmethod
+    def execute(cls, images, codec, fps, filename_prefix, crf):
+        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
+            filename_prefix, folder_paths.get_output_directory(), images[0].shape[1], images[0].shape[0]
+        )
+
+        file = f"{filename}_{counter:05}_.webm"
+        container = av.open(os.path.join(full_output_folder, file), mode="w")
+
+        if cls.hidden.prompt is not None:
+            container.metadata["prompt"] = json.dumps(cls.hidden.prompt)
+
+        if cls.hidden.extra_pnginfo is not None:
+            for x in cls.hidden.extra_pnginfo:
+                container.metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])
+
+        codec_map = {"vp9": "libvpx-vp9", "av1": "libsvtav1"}
+        stream = container.add_stream(codec_map[codec], rate=Fraction(round(fps * 1000), 1000))
+        stream.width = images.shape[-2]
+        stream.height = images.shape[-3]
+        stream.pix_fmt = "yuv420p10le" if codec == "av1" else "yuv420p"
+        stream.bit_rate = 0
+        stream.options = {'crf': str(crf)}
+        if codec == "av1":
+            stream.options["preset"] = "6"
+
+        for frame in images:
+            frame = av.VideoFrame.from_ndarray(torch.clamp(frame[..., :3] * 255, min=0, max=255).to(device=torch.device("cpu"), dtype=torch.uint8).numpy(), format="rgb24")
+            for packet in stream.encode(frame):
+                container.mux(packet)
+        container.mux(stream.encode())
+        container.close()
+
+        return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)]))
+
+
+NODES_LIST = [CreateVideo, GetVideoComponents, LoadVideo, SaveVideo, SaveWEBM]
diff --git a/nodes.py b/nodes.py
index 50956edb8..21bfbec7e 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2324,6 +2324,7 @@ def init_builtin_extra_nodes():
         "v3/nodes_primitive.py",
         "v3/nodes_rebatch.py",
         "v3/nodes_stable_cascade.py",
+        "v3/nodes_video.py",
         "v3/nodes_webcam.py",
     ]


From 9e37b5420bd6ac3e4ecb52fe260a823ddcec4091 Mon Sep 17 00:00:00 2001
From: bigcat88
Date: Sat, 19 Jul 2025 08:53:47 +0300
Subject: [PATCH 2/3] v3: converted nodes_wan.py

---
 comfy_extras/v3/nodes_wan.py | 437 +++++++++++++++++++++++++++++++++++
 nodes.py                     |   1 +
 2 files changed, 438 insertions(+)
 create mode 100644 comfy_extras/v3/nodes_wan.py

diff --git a/comfy_extras/v3/nodes_wan.py b/comfy_extras/v3/nodes_wan.py
new file mode 100644
index 000000000..9c1d94aa9
--- /dev/null
+++ b/comfy_extras/v3/nodes_wan.py
@@ -0,0 +1,437 @@
+from __future__ import annotations
+
+import torch
+
+import comfy.clip_vision
+import comfy.latent_formats
+import comfy.model_management
+import comfy.utils
+import node_helpers
+import nodes
+from comfy_api.v3 import io
+
+
+class TrimVideoLatent(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="TrimVideoLatent_V3",
+            category="latent/video",
+            is_experimental=True,
+            inputs=[
+                io.Latent.Input("samples"),
+                io.Int.Input("trim_amount", default=0, min=0, max=99999),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, samples, trim_amount):
+        samples_out = samples.copy()
+
+        s1 = samples["samples"]
+        samples_out["samples"] = s1[:, :, trim_amount:]
+        return io.NodeOutput(samples_out)
+
+
+class WanCameraImageToVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanCameraImageToVideo_V3",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+                io.Image.Input("start_image", optional=True),
+                io.WanCameraEmbedding.Input("camera_conditions", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output("positive_out", display_name="positive"),
+                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, camera_conditions=None):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
+
+        if start_image is not None:
+            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            concat_latent_image = vae.encode(start_image[:, :, :, :3])
+            concat_latent[:,:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
+
+        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent})
+        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent})
+
+        if camera_conditions is not None:
+            positive = node_helpers.conditioning_set_values(positive, {'camera_conditions': camera_conditions})
+            negative = node_helpers.conditioning_set_values(negative, {'camera_conditions': camera_conditions})
+
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return io.NodeOutput(positive, negative, out_latent)
+
+
+class WanFirstLastFrameToVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanFirstLastFrameToVideo_V3",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.ClipVisionOutput.Input("clip_vision_start_image", optional=True),
+                io.ClipVisionOutput.Input("clip_vision_end_image", optional=True),
+                io.Image.Input("start_image", optional=True),
+                io.Image.Input("end_image", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output("positive_out", display_name="positive"),
+                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        if start_image is not None:
+            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+        if end_image is not None:
+            end_image = comfy.utils.common_upscale(end_image[-length:].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+
+        image = torch.ones((length, height, width, 3)) * 0.5
+        mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))
+
+        if start_image is not None:
+            image[:start_image.shape[0]] = start_image
+            mask[:, :, :start_image.shape[0] + 3] = 0.0
+
+        if end_image is not None:
+            image[-end_image.shape[0]:] = end_image
+            mask[:, :, -end_image.shape[0]:] = 0.0
+
+        concat_latent_image = vae.encode(image[:, :, :, :3])
+        mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
+        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
+        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
+
+        clip_vision_output = None
+        if clip_vision_start_image is not None:
+            clip_vision_output = clip_vision_start_image
+
+        if clip_vision_end_image is not None:
+            if clip_vision_output is not None:
+                states = torch.cat([clip_vision_output.penultimate_hidden_states, clip_vision_end_image.penultimate_hidden_states], dim=-2)
+                clip_vision_output = comfy.clip_vision.Output()
+                clip_vision_output.penultimate_hidden_states = states
+            else:
+                clip_vision_output = clip_vision_end_image
+
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return io.NodeOutput(positive, negative, out_latent)
+
+
+class WanFunControlToVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanFunControlToVideo_V3",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+                io.Image.Input("start_image", optional=True),
+                io.Image.Input("control_video", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output("positive_out", display_name="positive"),
+                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
+        concat_latent = concat_latent.repeat(1, 2, 1, 1, 1)
+
+        if start_image is not None:
+            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            concat_latent_image = vae.encode(start_image[:, :, :, :3])
+            concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
+
+        if control_video is not None:
+            control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            concat_latent_image = vae.encode(control_video[:, :, :, :3])
+            concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
+
+        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent})
+        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent})
+
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return io.NodeOutput(positive, negative, out_latent)
+
+
+class WanFunInpaintToVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanFunInpaintToVideo_V3",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+                io.Image.Input("start_image", optional=True),
+                io.Image.Input("end_image", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output("positive_out", display_name="positive"),
+                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None):
+        flfv = WanFirstLastFrameToVideo()
+        return flfv.execute(positive, negative, vae, width, height, length, batch_size, start_image=start_image, end_image=end_image, clip_vision_start_image=clip_vision_output)
+
+
+class WanImageToVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanImageToVideo_V3",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+                io.Image.Input("start_image", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output("positive_out", display_name="positive"),
+                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        if start_image is not None:
+            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            image = torch.ones((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5
+            image[:start_image.shape[0]] = start_image
+
+            concat_latent_image = vae.encode(image[:, :, :, :3])
+            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
+            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0
+
+            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
+            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
+
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return io.NodeOutput(positive, negative, out_latent)
+
+
+class WanPhantomSubjectToVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanPhantomSubjectToVideo_V3",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("images", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output("positive_out", display_name="positive"),
+                io.Conditioning.Output("negative_text", display_name="negative"),
+                io.Conditioning.Output("negative_img_text", display_name="negative_img_text"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, images=None):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        cond2 = negative
+        if images is not None:
+            images = comfy.utils.common_upscale(images[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            latent_images = []
+            for i in images:
+                latent_images += [vae.encode(i.unsqueeze(0)[:, :, :, :3])]
+            concat_latent_image = torch.cat(latent_images, dim=2)
+
+            positive = node_helpers.conditioning_set_values(positive, {"time_dim_concat": concat_latent_image})
+            cond2 = node_helpers.conditioning_set_values(negative, {"time_dim_concat": concat_latent_image})
+            negative = node_helpers.conditioning_set_values(negative, {"time_dim_concat": comfy.latent_formats.Wan21().process_out(torch.zeros_like(concat_latent_image))})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return io.NodeOutput(positive, cond2, negative, out_latent)
+
+
+class WanVaceToVideo(io.ComfyNodeV3):
+    @classmethod
+    def define_schema(cls):
+        return io.SchemaV3(
+            node_id="WanVaceToVideo_V3",
+            category="conditioning/video_models",
+            is_experimental=True,
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Float.Input("strength", default=1.0, min=0.0, max=1000.0, step=0.01),
+                io.Image.Input("control_video", optional=True),
+                io.Mask.Input("control_masks", optional=True),
+                io.Image.Input("reference_image", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output("positive_out", display_name="positive"),
+                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+                io.Int.Output(display_name="trim_latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, strength, control_video=None, control_masks=None, reference_image=None):
+        latent_length = ((length - 1) // 4) + 1
+        if control_video is not None:
+            control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            if control_video.shape[0] < length:
+                control_video = torch.nn.functional.pad(control_video, (0, 0, 0, 0, 0, 0, 0, length - control_video.shape[0]), value=0.5)
+        else:
+            control_video = torch.ones((length, height, width, 3)) * 0.5
+
+        if reference_image is not None:
+            reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            reference_image = vae.encode(reference_image[:, :, :, :3])
+            reference_image = torch.cat([reference_image, comfy.latent_formats.Wan21().process_out(torch.zeros_like(reference_image))], dim=1)
+
+        if control_masks is None:
+            mask = torch.ones((length, height, width, 1))
+        else:
+            mask = control_masks
+            if mask.ndim == 3:
+                mask = mask.unsqueeze(1)
+            mask = comfy.utils.common_upscale(mask[:length], width, height, "bilinear", "center").movedim(1, -1)
+            if mask.shape[0] < length:
+                mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, 0, 0, length - mask.shape[0]), value=1.0)
+
+        control_video = control_video - 0.5
+        inactive = (control_video * (1 - mask)) + 0.5
+        reactive = (control_video * mask) + 0.5
+
+        inactive = vae.encode(inactive[:, :, :, :3])
+        reactive = vae.encode(reactive[:, :, :, :3])
+        control_video_latent = torch.cat((inactive, reactive), dim=1)
+        if reference_image is not None:
+            control_video_latent = torch.cat((reference_image, control_video_latent), dim=2)
+
+        vae_stride = 8
+        height_mask = height // vae_stride
+        width_mask = width // vae_stride
+        mask = mask.view(length, height_mask, vae_stride, width_mask, vae_stride)
+        mask = mask.permute(2, 4, 0, 1, 3)
+        mask = mask.reshape(vae_stride * vae_stride, length, height_mask, width_mask)
+        mask = torch.nn.functional.interpolate(mask.unsqueeze(0), size=(latent_length, height_mask, width_mask), mode='nearest-exact').squeeze(0)
+
+        trim_latent = 0
+        if reference_image is not None:
+            mask_pad = torch.zeros_like(mask[:, :reference_image.shape[2], :, :])
+            mask = torch.cat((mask_pad, mask), dim=1)
+            latent_length += reference_image.shape[2]
+            trim_latent = reference_image.shape[2]
+
+        mask = mask.unsqueeze(0)
+
+        positive = node_helpers.conditioning_set_values(positive, {"vace_frames": [control_video_latent], "vace_mask": [mask], "vace_strength": [strength]}, append=True)
+        negative = node_helpers.conditioning_set_values(negative, {"vace_frames": [control_video_latent], "vace_mask": [mask], "vace_strength": [strength]}, append=True)
+
+        latent = torch.zeros([batch_size, 16, latent_length, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        out_latent = {}
+        out_latent["samples"] = latent
+        return io.NodeOutput(positive, negative, out_latent, trim_latent)
+
+
+NODES_LIST = [
+    TrimVideoLatent,
+    WanCameraImageToVideo,
+    WanFirstLastFrameToVideo,
+    WanFunControlToVideo,
+    WanFunInpaintToVideo,
+    WanImageToVideo,
+    WanPhantomSubjectToVideo,
+    WanVaceToVideo,
+]
diff --git a/nodes.py b/nodes.py
index 21bfbec7e..4a7cb83c3 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2325,6 +2325,7 @@ def init_builtin_extra_nodes():
         "v3/nodes_rebatch.py",
         "v3/nodes_stable_cascade.py",
         "v3/nodes_video.py",
+        "v3/nodes_wan.py",
         "v3/nodes_webcam.py",
     ]


From f15c63c37d60e0790e84a15c64afa0fd47d79bf1 Mon Sep 17 00:00:00 2001
From: bigcat88
Date: Sun, 20 Jul 2025 06:55:45 +0300
Subject: [PATCH 3/3] removed `id` from outputs

---
 comfy_extras/v3/nodes_video.py |  8 ++++----
 comfy_extras/v3/nodes_wan.py   | 30 +++++++++++++++---------------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/comfy_extras/v3/nodes_video.py b/comfy_extras/v3/nodes_video.py
index 87cbe55fa..e4adf4c29 100644
--- a/comfy_extras/v3/nodes_video.py
+++ b/comfy_extras/v3/nodes_video.py
@@ -29,7 +29,7 @@ class CreateVideo(io.ComfyNodeV3):
                 io.Audio.Input("audio", optional=True, tooltip="The audio to add to the video."),
             ],
             outputs=[
-                io.Video.Output("video"),
+                io.Video.Output(),
             ],
         )

@@ -56,9 +56,9 @@ class GetVideoComponents(io.ComfyNodeV3):
                 io.Video.Input("video", tooltip="The video to extract components from."),
             ],
             outputs=[
-                io.Image.Output("images"),
-                io.Audio.Output("audio"),
-                io.Float.Output("fps"),
+                io.Image.Output(display_name="images"),
+                io.Audio.Output(display_name="audio"),
+                io.Float.Output(display_name="fps"),
             ],
         )

diff --git a/comfy_extras/v3/nodes_wan.py b/comfy_extras/v3/nodes_wan.py
index 9c1d94aa9..a1f3f5461 100644
--- a/comfy_extras/v3/nodes_wan.py
+++ b/comfy_extras/v3/nodes_wan.py
@@ -55,8 +55,8 @@ class WanCameraImageToVideo(io.ComfyNodeV3):
                 io.WanCameraEmbedding.Input("camera_conditions", optional=True),
             ],
             outputs=[
-                io.Conditioning.Output("positive_out", display_name="positive"),
-                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
                 io.Latent.Output(display_name="latent"),
             ],
         )
@@ -108,8 +108,8 @@ class WanFirstLastFrameToVideo(io.ComfyNodeV3):
                 io.Image.Input("end_image", optional=True),
             ],
             outputs=[
-                io.Conditioning.Output("positive_out", display_name="positive"),
-                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
                 io.Latent.Output(display_name="latent"),
             ],
         )
@@ -178,8 +178,8 @@ class WanFunControlToVideo(io.ComfyNodeV3):
                 io.Image.Input("control_video", optional=True),
             ],
             outputs=[
-                io.Conditioning.Output("positive_out", display_name="positive"),
-                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
                 io.Latent.Output(display_name="latent"),
             ],
         )
@@ -232,8 +232,8 @@ class WanFunInpaintToVideo(io.ComfyNodeV3):
                 io.Image.Input("end_image", optional=True),
             ],
             outputs=[
-                io.Conditioning.Output("positive_out", display_name="positive"),
-                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
                 io.Latent.Output(display_name="latent"),
             ],
         )
@@ -262,8 +262,8 @@ class WanImageToVideo(io.ComfyNodeV3):
                 io.Image.Input("start_image", optional=True),
             ],
             outputs=[
-                io.Conditioning.Output("positive_out", display_name="positive"),
-                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
                 io.Latent.Output(display_name="latent"),
             ],
         )
@@ -309,9 +309,9 @@ class WanPhantomSubjectToVideo(io.ComfyNodeV3):
                 io.Image.Input("images", optional=True),
             ],
             outputs=[
-                io.Conditioning.Output("positive_out", display_name="positive"),
-                io.Conditioning.Output("negative_text", display_name="negative"),
-                io.Conditioning.Output("negative_img_text", display_name="negative_img_text"),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative_text"),
+                io.Conditioning.Output(display_name="negative_img_text"),
                 io.Latent.Output(display_name="latent"),
             ],
         )
@@ -357,8 +357,8 @@ class WanVaceToVideo(io.ComfyNodeV3):
                 io.Image.Input("reference_image", optional=True),
             ],
             outputs=[
-                io.Conditioning.Output("positive_out", display_name="positive"),
-                io.Conditioning.Output("negative_out", display_name="negative"),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
                 io.Latent.Output(display_name="latent"),
                 io.Int.Output(display_name="trim_latent"),
             ],