mirror of https://github.com/comfyanonymous/ComfyUI.git
synced 2025-07-27 16:26:39 +00:00

Merge pull request #8964 from bigcat88/v3/nodes/video-save
[V3] SaveVideo, LoadVideo, SaveWEBM, WAN nodes
Commit: 6a77eb15bc
@@ -361,6 +361,14 @@ class PreviewAudio(_UIOutput):
         return {"audio": self.values}
 
 
+class PreviewVideo(_UIOutput):
+    def __init__(self, values: list[SavedResult | dict], **kwargs):
+        self.values = values
+
+    def as_dict(self):
+        return {"images": self.values, "animated": (True,)}
+
+
 class PreviewUI3D(_UIOutput):
     def __init__(self, values: list[SavedResult | dict], **kwargs):
         self.values = values
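
For context, a short sketch (not part of the diff) of what the new PreviewVideo output serializes for the frontend. The filename and subfolder are made-up values, and the imports assume the comfy_api.v3 namespace used by the new nodes below:

from comfy_api.v3 import io, ui

# Illustrative values only: "demo.webm" in the "video" output subfolder.
result = ui.SavedResult("demo.webm", "video", io.FolderType.output)
payload = ui.PreviewVideo([result]).as_dict()
# payload == {"images": [result], "animated": (True,)}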

comfy_extras/v3/nodes_video.py (new file, 210 lines)
@@ -0,0 +1,210 @@
from __future__ import annotations

import json
import os
from fractions import Fraction

import av
import torch

import folder_paths
from comfy.cli_args import args
from comfy_api.input import AudioInput, ImageInput, VideoInput
from comfy_api.input_impl import VideoFromComponents, VideoFromFile
from comfy_api.util import VideoCodec, VideoComponents, VideoContainer
from comfy_api.v3 import io, ui

class CreateVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="CreateVideo_V3",
            display_name="Create Video _V3",
            category="image/video",
            description="Create a video from images.",
            inputs=[
                io.Image.Input("images", tooltip="The images to create a video from."),
                io.Float.Input("fps", default=30.0, min=1.0, max=120.0, step=1.0),
                io.Audio.Input("audio", optional=True, tooltip="The audio to add to the video."),
            ],
            outputs=[
                io.Video.Output(),
            ],
        )

    @classmethod
    def execute(cls, images: ImageInput, fps: float, audio: AudioInput = None):
        return io.NodeOutput(VideoFromComponents(
            VideoComponents(
                images=images,
                audio=audio,
                frame_rate=Fraction(fps),
            )
        ))

class GetVideoComponents(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="GetVideoComponents_V3",
            display_name="Get Video Components _V3",
            category="image/video",
            description="Extracts all components from a video: frames, audio, and framerate.",
            inputs=[
                io.Video.Input("video", tooltip="The video to extract components from."),
            ],
            outputs=[
                io.Image.Output(display_name="images"),
                io.Audio.Output(display_name="audio"),
                io.Float.Output(display_name="fps"),
            ],
        )

    @classmethod
    def execute(cls, video: VideoInput):
        components = video.get_components()
        return io.NodeOutput(components.images, components.audio, float(components.frame_rate))

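As a usage illustration (not part of this commit): CreateVideo and GetVideoComponents round-trip through VideoFromComponents. A minimal sketch, assuming a working ComfyUI environment and (N, H, W, C) float frames in [0, 1]:

import torch
from fractions import Fraction
from comfy_api.input_impl import VideoFromComponents
from comfy_api.util import VideoComponents

frames = torch.rand(24, 480, 832, 3)  # one second of frames at 24 fps
video = VideoFromComponents(VideoComponents(images=frames, audio=None, frame_rate=Fraction(24)))
components = video.get_components()   # the call GetVideoComponents wraps
print(tuple(components.images.shape), float(components.frame_rate))  # (24, 480, 832, 3) 24.0
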
class LoadVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        input_dir = folder_paths.get_input_directory()
        files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
        files = folder_paths.filter_files_content_types(files, ["video"])
        return io.SchemaV3(
            node_id="LoadVideo_V3",
            display_name="Load Video _V3",
            category="image/video",
            inputs=[
                io.Combo.Input("file", options=sorted(files), upload=io.UploadType.video),
            ],
            outputs=[
                io.Video.Output(),
            ],
        )

    @classmethod
    def execute(cls, file):
        video_path = folder_paths.get_annotated_filepath(file)
        return io.NodeOutput(VideoFromFile(video_path))

    @classmethod
    def fingerprint_inputs(s, file):
        video_path = folder_paths.get_annotated_filepath(file)
        mod_time = os.path.getmtime(video_path)
        # Instead of hashing the file, we can just use the modification time to avoid rehashing large files.
        return mod_time

    @classmethod
    def validate_inputs(s, file):
        if not folder_paths.exists_annotated_filepath(file):
            return "Invalid video file: {}".format(file)
        return True

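The fingerprint_inputs hook above deliberately trades exactness for speed: a changed mtime re-triggers execution without hashing multi-gigabyte files. A standalone sketch of the same caching idea (the names here are illustrative, not ComfyUI API):

import os

_cache: dict[str, tuple[float, object]] = {}

def load_cached(path: str, loader):
    """Reload a file only when its modification time changes."""
    mtime = os.path.getmtime(path)
    hit = _cache.get(path)
    if hit is not None and hit[0] == mtime:
        return hit[1]  # mtime unchanged: reuse the cached value
    value = loader(path)
    _cache[path] = (mtime, value)
    return value
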
class SaveVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="SaveVideo_V3",
            display_name="Save Video _V3",
            category="image/video",
            description="Saves the input video to your ComfyUI output directory.",
            inputs=[
                io.Video.Input("video", tooltip="The video to save."),
                io.String.Input("filename_prefix", default="video/ComfyUI", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."),
                io.Combo.Input("format", options=VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."),
                io.Combo.Input("codec", options=VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."),
            ],
            outputs=[],
            hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo],
            is_output_node=True,
        )

    @classmethod
    def execute(cls, video: VideoInput, filename_prefix, format, codec):
        width, height = video.get_dimensions()
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
            filename_prefix,
            folder_paths.get_output_directory(),
            width,
            height,
        )
        saved_metadata = None
        if not args.disable_metadata:
            metadata = {}
            if cls.hidden.extra_pnginfo is not None:
                metadata.update(cls.hidden.extra_pnginfo)
            if cls.hidden.prompt is not None:
                metadata["prompt"] = cls.hidden.prompt
            if len(metadata) > 0:
                saved_metadata = metadata
        file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}"
        video.save_to(
            os.path.join(full_output_folder, file),
            format=format,
            codec=codec,
            metadata=saved_metadata,
        )
        return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)]))

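For reference, output names produced above follow the usual ComfyUI pattern of a zero-padded five-digit counter; a tiny sketch with example values:

# Example values only; the real filename/counter come from get_save_image_path().
filename, counter, ext = "ComfyUI", 7, "mp4"
print(f"{filename}_{counter:05}_.{ext}")  # ComfyUI_00007_.mp4
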
class SaveWEBM(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="SaveWEBM_V3",
            category="image/video",
            is_experimental=True,
            inputs=[
                io.Image.Input("images"),
                io.String.Input("filename_prefix", default="ComfyUI"),
                io.Combo.Input("codec", options=["vp9", "av1"]),
                io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01),
                io.Float.Input("crf", default=32.0, min=0, max=63.0, step=1, tooltip="Higher crf means lower quality with a smaller file size; lower crf means higher quality with a larger file size."),
            ],
            outputs=[],
            hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo],
            is_output_node=True,
        )

    @classmethod
    def execute(cls, images, codec, fps, filename_prefix, crf):
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
            filename_prefix, folder_paths.get_output_directory(), images[0].shape[1], images[0].shape[0]
        )

        file = f"{filename}_{counter:05}_.webm"
        container = av.open(os.path.join(full_output_folder, file), mode="w")

        if cls.hidden.prompt is not None:
            container.metadata["prompt"] = json.dumps(cls.hidden.prompt)

        if cls.hidden.extra_pnginfo is not None:
            for x in cls.hidden.extra_pnginfo:
                container.metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])

        codec_map = {"vp9": "libvpx-vp9", "av1": "libsvtav1"}
        stream = container.add_stream(codec_map[codec], rate=Fraction(round(fps * 1000), 1000))
        stream.width = images.shape[-2]
        stream.height = images.shape[-3]
        stream.pix_fmt = "yuv420p10le" if codec == "av1" else "yuv420p"
        stream.bit_rate = 0
        stream.options = {"crf": str(crf)}
        if codec == "av1":
            stream.options["preset"] = "6"

        for frame in images:
            frame = av.VideoFrame.from_ndarray(torch.clamp(frame[..., :3] * 255, min=0, max=255).to(device=torch.device("cpu"), dtype=torch.uint8).numpy(), format="rgb24")
            for packet in stream.encode(frame):
                container.mux(packet)
        container.mux(stream.encode())
        container.close()

        return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)]))

NODES_LIST = [CreateVideo, GetVideoComponents, LoadVideo, SaveVideo, SaveWEBM]
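
A standalone sketch of the encode/mux/flush sequence SaveWEBM uses, assuming PyAV with libvpx-vp9 available; the output path and synthetic frames are illustrative:

import av
import numpy as np
from fractions import Fraction

container = av.open("/tmp/demo.webm", mode="w")
stream = container.add_stream("libvpx-vp9", rate=Fraction(24000, 1000))
stream.width, stream.height = 320, 240
stream.pix_fmt = "yuv420p"
stream.options = {"crf": "32"}

for _ in range(24):
    rgb = np.random.randint(0, 256, (240, 320, 3), dtype=np.uint8)
    for packet in stream.encode(av.VideoFrame.from_ndarray(rgb, format="rgb24")):
        container.mux(packet)
container.mux(stream.encode())  # flush delayed packets before closing
container.close()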

comfy_extras/v3/nodes_wan.py (new file, 437 lines)
@@ -0,0 +1,437 @@
from __future__ import annotations

import torch

import comfy.clip_vision
import comfy.latent_formats
import comfy.model_management
import comfy.utils
import node_helpers
import nodes
from comfy_api.v3 import io

class TrimVideoLatent(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="TrimVideoLatent_V3",
            category="latent/video",
            is_experimental=True,
            inputs=[
                io.Latent.Input("samples"),
                io.Int.Input("trim_amount", default=0, min=0, max=99999),
            ],
            outputs=[
                io.Latent.Output(),
            ],
        )

    @classmethod
    def execute(cls, samples, trim_amount):
        samples_out = samples.copy()

        s1 = samples["samples"]
        samples_out["samples"] = s1[:, :, trim_amount:]
        return io.NodeOutput(samples_out)

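The trim above slices the temporal axis (dim 2) of a [B, C, T, H, W] video latent; a quick shape check with hypothetical Wan-sized values:

import torch

samples = {"samples": torch.zeros(1, 16, 21, 60, 104)}  # 21 latent frames
trimmed = samples["samples"][:, :, 2:]                  # drop the first 2
print(trimmed.shape)  # torch.Size([1, 16, 19, 60, 104])
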
class WanCameraImageToVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="WanCameraImageToVideo_V3",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
                io.Image.Input("start_image", optional=True),
                io.WanCameraEmbedding.Input("camera_conditions", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, camera_conditions=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)

        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            concat_latent_image = vae.encode(start_image[:, :, :, :3])
            concat_latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image[:, :, :concat_latent.shape[2]]

        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent})

        if camera_conditions is not None:
            positive = node_helpers.conditioning_set_values(positive, {"camera_conditions": camera_conditions})
            negative = node_helpers.conditioning_set_values(negative, {"camera_conditions": camera_conditions})

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return io.NodeOutput(positive, negative, out_latent)

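The empty latent allocated above reflects Wan 2.1's 4x temporal and 8x spatial compression; a worked check of the shape arithmetic using the schema defaults:

# length=81, height=480, width=832 (schema defaults)
length, height, width = 81, 480, 832
print(((length - 1) // 4) + 1, height // 8, width // 8)  # 21 60 104
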
class WanFirstLastFrameToVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="WanFirstLastFrameToVideo_V3",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.ClipVisionOutput.Input("clip_vision_start_image", optional=True),
                io.ClipVisionOutput.Input("clip_vision_end_image", optional=True),
                io.Image.Input("start_image", optional=True),
                io.Image.Input("end_image", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
        if end_image is not None:
            end_image = comfy.utils.common_upscale(end_image[-length:].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

        image = torch.ones((length, height, width, 3)) * 0.5
        mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))

        if start_image is not None:
            image[:start_image.shape[0]] = start_image
            mask[:, :, :start_image.shape[0] + 3] = 0.0

        if end_image is not None:
            image[-end_image.shape[0]:] = end_image
            mask[:, :, -end_image.shape[0]:] = 0.0

        concat_latent_image = vae.encode(image[:, :, :, :3])
        mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})

        clip_vision_output = None
        if clip_vision_start_image is not None:
            clip_vision_output = clip_vision_start_image

        if clip_vision_end_image is not None:
            if clip_vision_output is not None:
                states = torch.cat([clip_vision_output.penultimate_hidden_states, clip_vision_end_image.penultimate_hidden_states], dim=-2)
                clip_vision_output = comfy.clip_vision.Output()
                clip_vision_output.penultimate_hidden_states = states
            else:
                clip_vision_output = clip_vision_end_image

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return io.NodeOutput(positive, negative, out_latent)

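The view/transpose on the mask above regroups every 4 pixel-frames into one latent step, turning [1, 1, 4T, h, w] into [1, 4, T, h, w]; a shape-only sketch with hypothetical sizes:

import torch

t_latent, h, w = 21, 60, 104
mask = torch.ones(1, 1, t_latent * 4, h, w)
mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
print(mask.shape)  # torch.Size([1, 4, 21, 60, 104])
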
class WanFunControlToVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="WanFunControlToVideo_V3",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
                io.Image.Input("start_image", optional=True),
                io.Image.Input("control_video", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
        concat_latent = concat_latent.repeat(1, 2, 1, 1, 1)

        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            concat_latent_image = vae.encode(start_image[:, :, :, :3])
            concat_latent[:, 16:, :concat_latent_image.shape[2]] = concat_latent_image[:, :, :concat_latent.shape[2]]

        if control_video is not None:
            control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            concat_latent_image = vae.encode(control_video[:, :, :, :3])
            concat_latent[:, :16, :concat_latent_image.shape[2]] = concat_latent_image[:, :, :concat_latent.shape[2]]

        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent})

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return io.NodeOutput(positive, negative, out_latent)

class WanFunInpaintToVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="WanFunInpaintToVideo_V3",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
                io.Image.Input("start_image", optional=True),
                io.Image.Input("end_image", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None):
        flfv = WanFirstLastFrameToVideo()
        return flfv.execute(positive, negative, vae, width, height, length, batch_size, start_image=start_image, end_image=end_image, clip_vision_start_image=clip_vision_output)

class WanImageToVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="WanImageToVideo_V3",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
                io.Image.Input("start_image", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            image = torch.ones((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5
            image[:start_image.shape[0]] = start_image

            concat_latent_image = vae.encode(image[:, :, :, :3])
            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return io.NodeOutput(positive, negative, out_latent)

class WanPhantomSubjectToVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="WanPhantomSubjectToVideo_V3",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Image.Input("images", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative_text"),
                io.Conditioning.Output(display_name="negative_img_text"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, images=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        cond2 = negative
        if images is not None:
            images = comfy.utils.common_upscale(images[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            latent_images = []
            for i in images:
                latent_images += [vae.encode(i.unsqueeze(0)[:, :, :, :3])]
            concat_latent_image = torch.cat(latent_images, dim=2)

            positive = node_helpers.conditioning_set_values(positive, {"time_dim_concat": concat_latent_image})
            cond2 = node_helpers.conditioning_set_values(negative, {"time_dim_concat": concat_latent_image})
            negative = node_helpers.conditioning_set_values(negative, {"time_dim_concat": comfy.latent_formats.Wan21().process_out(torch.zeros_like(concat_latent_image))})

        out_latent = {}
        out_latent["samples"] = latent
        return io.NodeOutput(positive, cond2, negative, out_latent)

class WanVaceToVideo(io.ComfyNodeV3):
    @classmethod
    def define_schema(cls):
        return io.SchemaV3(
            node_id="WanVaceToVideo_V3",
            category="conditioning/video_models",
            is_experimental=True,
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Float.Input("strength", default=1.0, min=0.0, max=1000.0, step=0.01),
                io.Image.Input("control_video", optional=True),
                io.Mask.Input("control_masks", optional=True),
                io.Image.Input("reference_image", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
                io.Int.Output(display_name="trim_latent"),
            ],
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, strength, control_video=None, control_masks=None, reference_image=None):
        latent_length = ((length - 1) // 4) + 1
        if control_video is not None:
            control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            if control_video.shape[0] < length:
                control_video = torch.nn.functional.pad(control_video, (0, 0, 0, 0, 0, 0, 0, length - control_video.shape[0]), value=0.5)
        else:
            control_video = torch.ones((length, height, width, 3)) * 0.5

        if reference_image is not None:
            reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            reference_image = vae.encode(reference_image[:, :, :, :3])
            reference_image = torch.cat([reference_image, comfy.latent_formats.Wan21().process_out(torch.zeros_like(reference_image))], dim=1)

        if control_masks is None:
            mask = torch.ones((length, height, width, 1))
        else:
            mask = control_masks
            if mask.ndim == 3:
                mask = mask.unsqueeze(1)
            mask = comfy.utils.common_upscale(mask[:length], width, height, "bilinear", "center").movedim(1, -1)
            if mask.shape[0] < length:
                mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, 0, 0, length - mask.shape[0]), value=1.0)

        control_video = control_video - 0.5
        inactive = (control_video * (1 - mask)) + 0.5
        reactive = (control_video * mask) + 0.5

        inactive = vae.encode(inactive[:, :, :, :3])
        reactive = vae.encode(reactive[:, :, :, :3])
        control_video_latent = torch.cat((inactive, reactive), dim=1)
        if reference_image is not None:
            control_video_latent = torch.cat((reference_image, control_video_latent), dim=2)

        vae_stride = 8
        height_mask = height // vae_stride
        width_mask = width // vae_stride
        mask = mask.view(length, height_mask, vae_stride, width_mask, vae_stride)
        mask = mask.permute(2, 4, 0, 1, 3)
        mask = mask.reshape(vae_stride * vae_stride, length, height_mask, width_mask)
        mask = torch.nn.functional.interpolate(mask.unsqueeze(0), size=(latent_length, height_mask, width_mask), mode='nearest-exact').squeeze(0)

        trim_latent = 0
        if reference_image is not None:
            mask_pad = torch.zeros_like(mask[:, :reference_image.shape[2], :, :])
            mask = torch.cat((mask_pad, mask), dim=1)
            latent_length += reference_image.shape[2]
            trim_latent = reference_image.shape[2]

        mask = mask.unsqueeze(0)

        positive = node_helpers.conditioning_set_values(positive, {"vace_frames": [control_video_latent], "vace_mask": [mask], "vace_strength": [strength]}, append=True)
        negative = node_helpers.conditioning_set_values(negative, {"vace_frames": [control_video_latent], "vace_mask": [mask], "vace_strength": [strength]}, append=True)

        latent = torch.zeros([batch_size, 16, latent_length, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        out_latent = {}
        out_latent["samples"] = latent
        return io.NodeOutput(positive, negative, out_latent, trim_latent)

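The VACE mask path above folds each 8x8 spatial block into channels (a pixel-unshuffle) before temporally interpolating down to the latent length; a shape-only sketch with the same defaults:

import torch

length, height, width, vae_stride = 81, 480, 832, 8
h, w = height // vae_stride, width // vae_stride
mask = torch.ones(length, h, vae_stride, w, vae_stride)
mask = mask.permute(2, 4, 0, 1, 3).reshape(vae_stride * vae_stride, length, h, w)
print(mask.shape)  # torch.Size([64, 81, 60, 104])
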
NODES_LIST = [
    TrimVideoLatent,
    WanCameraImageToVideo,
    WanFirstLastFrameToVideo,
    WanFunControlToVideo,
    WanFunInpaintToVideo,
    WanImageToVideo,
    WanPhantomSubjectToVideo,
    WanVaceToVideo,
]

nodes.py (2 lines added)
@@ -2330,6 +2330,8 @@ def init_builtin_extra_nodes():
         "v3/nodes_primitive.py",
         "v3/nodes_rebatch.py",
         "v3/nodes_stable_cascade.py",
+        "v3/nodes_video.py",
+        "v3/nodes_wan.py",
         "v3/nodes_webcam.py",
     ]