feat(api-nodes): add Vidu Video nodes (#9368)

2025-09-10 19:46:38 +00:00 · 2025-08-19 23:30:06 +03:00
parent d844d8b13b
commit 54d8fdbed0
3 changed files with 676 additions and 0 deletions
--- a/comfy_api_nodes/nodes_vidu.py
+++ b/comfy_api_nodes/nodes_vidu.py
@@ -0,0 +1,622 @@
+import logging
+from enum import Enum
+from typing import Any, Callable, Optional, Literal, TypeVar
+from typing_extensions import override
+
+import torch
+from pydantic import BaseModel, Field
+
+from comfy_api.latest import ComfyExtension, io as comfy_io
+from comfy_api_nodes.util.validation_utils import (
+    validate_aspect_ratio_closeness,
+    validate_image_dimensions,
+    validate_image_aspect_ratio_range,
+    get_number_of_images,
+)
+from comfy_api_nodes.apis.client import (
+    ApiEndpoint,
+    HttpMethod,
+    SynchronousOperation,
+    PollingOperation,
+    EmptyRequest,
+)
+from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi
+
+
+# Proxy route paths used to create Vidu generation tasks, one per node type.
+VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video"
+VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video"
+VIDU_REFERENCE_VIDEO = "/proxy/vidu/reference2video"
+VIDU_START_END_VIDEO = "/proxy/vidu/start-end2video"
+# Status route template; the task id is substituted with %-formatting.
+VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations"
+
+# Response type carried by the polling endpoint in poll_until_finished().
+R = TypeVar("R")
+
+class VideoModelName(str, Enum):
+    """Model identifiers offered by the nodes' "model" combo input."""
+    vidu_q1 = 'viduq1'
+
+
+class AspectRatio(str, Enum):
+    """Output aspect ratios offered by the nodes' "aspect_ratio" combo input."""
+    r_16_9 = "16:9"
+    r_9_16 = "9:16"
+    r_1_1 = "1:1"
+
+
+class Resolution(str, Enum):
+    """Output resolutions offered by the nodes' "resolution" combo input."""
+    r_1080p = "1080p"
+
+
+class MovementAmplitude(str, Enum):
+    """Values offered by the nodes' "movement_amplitude" combo input."""
+    auto = "auto"
+    small = "small"
+    medium = "medium"
+    large = "large"
+
+
+class TaskCreationRequest(BaseModel):
+    # Request body posted to the Vidu task-creation routes.
+    # Every field except `model` is optional and carries a default.
+    model: VideoModelName = VideoModelName.vidu_q1
+    prompt: Optional[str] = Field(default=None, max_length=1500)
+    duration: Optional[Literal[5]] = 5  # 5 is the only value the type accepts
+    seed: Optional[int] = Field(default=0, ge=0, le=2147483647)
+    aspect_ratio: Optional[AspectRatio] = AspectRatio.r_16_9
+    resolution: Optional[Resolution] = Resolution.r_1080p
+    movement_amplitude: Optional[MovementAmplitude] = MovementAmplitude.auto
+    images: Optional[list[str]] = Field(default=None, description="Base64 encoded string or image URL")
+
+
+class TaskStatus(str, Enum):
+    """Task states reported by Vidu.
+
+    Polling treats ``success`` as completed and ``failed`` as failed.
+    """
+    created = "created"
+    queueing = "queueing"
+    processing = "processing"
+    success = "success"
+    failed = "failed"
+
+
+class TaskCreationResponse(BaseModel):
+    # Response returned when a task is created; `task_id` is later used to
+    # build the status route for polling.
+    task_id: str
+    state: TaskStatus
+    created_at: str
+    code: Optional[int] = Field(default=None, description="Error code")
+
+
+class TaskResult(BaseModel):
+    # One generated result ("creation") of a finished task. All fields are required.
+    id: str = Field(default=..., description="Creation id")
+    url: str = Field(default=..., description="The URL of the generated results, valid for one hour")
+    cover_url: str = Field(default=..., description="The cover URL of the generated results, valid for one hour")
+
+
+class TaskStatusResponse(BaseModel):
+    # Response of the status route: current state, optional error code and
+    # the list of generated results.
+    state: TaskStatus
+    err_code: Optional[str] = Field(default=None)
+    creations: list[TaskResult] = Field(default=..., description="Generated results")
+
+
+async def poll_until_finished(
+    auth_kwargs: dict[str, str],
+    api_endpoint: ApiEndpoint[Any, R],
+    result_url_extractor: Optional[Callable[[R], str]] = None,
+    estimated_duration: Optional[int] = None,
+    node_id: Optional[str] = None,
+) -> R:
+    """Poll ``api_endpoint`` until the task reports ``success`` or ``failed``.
+
+    The state is read from ``response.state``. Polling uses a 16 second
+    interval and at most 256 attempts; the remaining arguments are forwarded
+    unchanged to ``PollingOperation``.
+
+    Returns:
+        Whatever ``PollingOperation.execute()`` returns for the endpoint.
+    """
+
+    def _state_of(response) -> str:
+        return response.state.value
+
+    operation = PollingOperation(
+        poll_endpoint=api_endpoint,
+        status_extractor=_state_of,
+        completed_statuses=[TaskStatus.success.value],
+        failed_statuses=[TaskStatus.failed.value],
+        result_url_extractor=result_url_extractor,
+        auth_kwargs=auth_kwargs,
+        node_id=node_id,
+        estimated_duration=estimated_duration,
+        poll_interval=16.0,
+        max_poll_attempts=256,
+    )
+    return await operation.execute()
+
+
+def get_video_url_from_response(response) -> Optional[str]:
+    """Return the URL of the first creation, or ``None`` when there are no creations."""
+    creations = response.creations
+    return creations[0].url if creations else None
+
+
+def get_video_from_response(response) -> TaskResult:
+    """Return the first creation of a finished task.
+
+    Args:
+        response: Status response exposing ``creations``, ``state`` and ``err_code``.
+
+    Returns:
+        The first entry of ``response.creations``.
+
+    Raises:
+        RuntimeError: If ``response.creations`` is empty.
+    """
+    creations = response.creations
+    if not creations:
+        error_msg = f"Vidu request does not contain results. State: {response.state}, Error Code: {response.err_code}"
+        # A missing result is a failure, so log it at error level like the
+        # failed-state branch in execute_task does.
+        logging.error(error_msg)
+        raise RuntimeError(error_msg)
+    first = creations[0]
+    logging.info("Vidu task %s succeeded. Video URL: %s", first.id, first.url)
+    return first
+
+
+async def execute_task(
+    vidu_endpoint: str,
+    auth_kwargs: Optional[dict[str, str]],
+    payload: TaskCreationRequest,
+    estimated_duration: int,
+    node_id: str,
+) -> TaskStatusResponse:
+    """Create a Vidu task and poll its status route until it finishes.
+
+    Args:
+        vidu_endpoint: Route the creation request is POSTed to.
+        auth_kwargs: Auth values forwarded to both the creation and polling calls.
+        payload: Request body for the creation call.
+        estimated_duration: Forwarded to the polling operation.
+        node_id: Forwarded to the polling operation.
+
+    Returns:
+        The status response produced by ``poll_until_finished``.
+
+    Raises:
+        RuntimeError: If the creation response already reports the ``failed`` state.
+    """
+    create_endpoint = ApiEndpoint(
+        path=vidu_endpoint,
+        method=HttpMethod.POST,
+        request_model=TaskCreationRequest,
+        response_model=TaskCreationResponse,
+    )
+    response = await SynchronousOperation(
+        endpoint=create_endpoint,
+        request=payload,
+        auth_kwargs=auth_kwargs,
+    ).execute()
+    if response.state == TaskStatus.failed:
+        error_msg = f"Vidu request failed. Code: {response.code}"
+        logging.error(error_msg)
+        raise RuntimeError(error_msg)
+    # The status route is specific to the task that was just created.
+    status_endpoint = ApiEndpoint(
+        path=VIDU_GET_GENERATION_STATUS % response.task_id,
+        method=HttpMethod.GET,
+        request_model=EmptyRequest,
+        response_model=TaskStatusResponse,
+    )
+    return await poll_until_finished(
+        auth_kwargs,
+        status_endpoint,
+        result_url_extractor=get_video_url_from_response,
+        estimated_duration=estimated_duration,
+        node_id=node_id,
+    )
+
+
+class ViduTextToVideoNode(comfy_io.ComfyNode):
+    """API node that generates a video from a text prompt via the Vidu text2video route."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its single video output and the hidden auth/id fields."""
+        return comfy_io.Schema(
+            node_id="ViduTextToVideoNode",
+            display_name="Vidu Text To Video Generation",
+            category="api node/video/Vidu",
+            description="Generate video from text prompt",
+            inputs=[
+                comfy_io.Combo.Input(
+                    "model",
+                    options=[model.value for model in VideoModelName],
+                    default=VideoModelName.vidu_q1.value,
+                    tooltip="Model name",
+                ),
+                comfy_io.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A textual description for video generation",
+                ),
+                comfy_io.Int.Input(
+                    "duration",
+                    default=5,
+                    min=5,
+                    max=5,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    tooltip="Duration of the output video in seconds",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed for video generation (0 for random)",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "aspect_ratio",
+                    options=[model.value for model in AspectRatio],
+                    default=AspectRatio.r_16_9.value,
+                    tooltip="The aspect ratio of the output video",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "resolution",
+                    options=[model.value for model in Resolution],
+                    default=Resolution.r_1080p.value,
+                    tooltip="Supported values may vary by model & duration",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "movement_amplitude",
+                    options=[model.value for model in MovementAmplitude],
+                    default=MovementAmplitude.auto.value,
+                    tooltip="The movement amplitude of objects in the frame",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                comfy_io.Video.Output(),
+            ],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        prompt: str,
+        duration: int,
+        seed: int,
+        aspect_ratio: str,
+        resolution: str,
+        movement_amplitude: str,
+    ) -> comfy_io.NodeOutput:
+        """Create the Vidu task, wait for it to finish and return the downloaded video.
+
+        Raises:
+            ValueError: If ``prompt`` is empty.
+            RuntimeError: If the task fails or finishes without any creation.
+        """
+        if not prompt:
+            raise ValueError("The prompt field is required and cannot be empty.")
+        # The request field is named `model`; `model_name` is not a declared
+        # field of TaskCreationRequest, so the selection must be passed as `model`.
+        payload = TaskCreationRequest(
+            model=model,
+            prompt=prompt,
+            duration=duration,
+            seed=seed,
+            aspect_ratio=aspect_ratio,
+            resolution=resolution,
+            movement_amplitude=movement_amplitude,
+        )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
+        results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id)
+        return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
+
+
+class ViduImageToVideoNode(comfy_io.ComfyNode):
+    """API node that generates a video from one start image via the Vidu img2video route."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its single video output and the hidden auth/id fields."""
+        return comfy_io.Schema(
+            node_id="ViduImageToVideoNode",
+            display_name="Vidu Image To Video Generation",
+            category="api node/video/Vidu",
+            description="Generate video from image and optional prompt",
+            inputs=[
+                comfy_io.Combo.Input(
+                    "model",
+                    options=[model.value for model in VideoModelName],
+                    default=VideoModelName.vidu_q1.value,
+                    tooltip="Model name",
+                ),
+                comfy_io.Image.Input(
+                    "image",
+                    tooltip="An image to be used as the start frame of the generated video",
+                ),
+                comfy_io.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="A textual description for video generation",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "duration",
+                    default=5,
+                    min=5,
+                    max=5,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    tooltip="Duration of the output video in seconds",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed for video generation (0 for random)",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "resolution",
+                    options=[model.value for model in Resolution],
+                    default=Resolution.r_1080p.value,
+                    tooltip="Supported values may vary by model & duration",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "movement_amplitude",
+                    options=[model.value for model in MovementAmplitude],
+                    default=MovementAmplitude.auto.value,
+                    tooltip="The movement amplitude of objects in the frame",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                comfy_io.Video.Output(),
+            ],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        image: torch.Tensor,
+        prompt: str,
+        duration: int,
+        seed: int,
+        resolution: str,
+        movement_amplitude: str,
+    ) -> comfy_io.NodeOutput:
+        """Upload the start image, create the Vidu task and return the downloaded video.
+
+        Raises:
+            ValueError: If more than one image is supplied or the image's
+                aspect ratio is outside the 1:4 .. 4:1 range.
+            RuntimeError: If the task fails or finishes without any creation.
+        """
+        if get_number_of_images(image) > 1:
+            raise ValueError("Only one input image is allowed.")
+        validate_image_aspect_ratio_range(image, (1, 4), (4, 1))
+        # The request field is named `model`; `model_name` is not a declared
+        # field of TaskCreationRequest, so the selection must be passed as `model`.
+        payload = TaskCreationRequest(
+            model=model,
+            prompt=prompt,
+            duration=duration,
+            seed=seed,
+            resolution=resolution,
+            movement_amplitude=movement_amplitude,
+        )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
+        # Upload only after the payload has been built and validated.
+        payload.images = await upload_images_to_comfyapi(
+            image,
+            max_images=1,
+            mime_type="image/png",
+            auth_kwargs=auth,
+        )
+        results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id)
+        return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
+
+
+class ViduReferenceVideoNode(comfy_io.ComfyNode):
+    """API node that generates a video from reference images and a prompt via the Vidu reference2video route."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its single video output and the hidden auth/id fields."""
+        return comfy_io.Schema(
+            node_id="ViduReferenceVideoNode",
+            display_name="Vidu Reference To Video Generation",
+            category="api node/video/Vidu",
+            description="Generate video from multiple images and prompt",
+            inputs=[
+                comfy_io.Combo.Input(
+                    "model",
+                    options=[model.value for model in VideoModelName],
+                    default=VideoModelName.vidu_q1.value,
+                    tooltip="Model name",
+                ),
+                comfy_io.Image.Input(
+                    "images",
+                    tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).",
+                ),
+                comfy_io.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A textual description for video generation",
+                ),
+                comfy_io.Int.Input(
+                    "duration",
+                    default=5,
+                    min=5,
+                    max=5,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    tooltip="Duration of the output video in seconds",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed for video generation (0 for random)",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "aspect_ratio",
+                    options=[model.value for model in AspectRatio],
+                    default=AspectRatio.r_16_9.value,
+                    tooltip="The aspect ratio of the output video",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "resolution",
+                    options=[model.value for model in Resolution],
+                    default=Resolution.r_1080p.value,
+                    tooltip="Supported values may vary by model & duration",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "movement_amplitude",
+                    options=[model.value for model in MovementAmplitude],
+                    default=MovementAmplitude.auto.value,
+                    tooltip="The movement amplitude of objects in the frame",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                comfy_io.Video.Output(),
+            ],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        images: torch.Tensor,
+        prompt: str,
+        duration: int,
+        seed: int,
+        aspect_ratio: str,
+        resolution: str,
+        movement_amplitude: str,
+    ) -> comfy_io.NodeOutput:
+        """Validate and upload the reference images, create the Vidu task and return the video.
+
+        Raises:
+            ValueError: If ``prompt`` is empty, more than 7 images are given,
+                or an image fails the aspect-ratio / minimum-size checks.
+            RuntimeError: If the task fails or finishes without any creation.
+        """
+        if not prompt:
+            raise ValueError("The prompt field is required and cannot be empty.")
+        image_count = get_number_of_images(images)
+        if image_count > 7:
+            raise ValueError("Too many images, maximum allowed is 7.")
+        # Each reference image is checked individually before anything is uploaded.
+        for image in images:
+            validate_image_aspect_ratio_range(image, (1, 4), (4, 1))
+            validate_image_dimensions(image, min_width=128, min_height=128)
+        # The request field is named `model`; `model_name` is not a declared
+        # field of TaskCreationRequest, so the selection must be passed as `model`.
+        payload = TaskCreationRequest(
+            model=model,
+            prompt=prompt,
+            duration=duration,
+            seed=seed,
+            aspect_ratio=aspect_ratio,
+            resolution=resolution,
+            movement_amplitude=movement_amplitude,
+        )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
+        payload.images = await upload_images_to_comfyapi(
+            images,
+            max_images=7,
+            mime_type="image/png",
+            auth_kwargs=auth,
+        )
+        results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id)
+        return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
+
+
+class ViduStartEndToVideoNode(comfy_io.ComfyNode):
+    """API node that generates a video between a start and an end frame via the Vidu start-end2video route."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, its single video output and the hidden auth/id fields."""
+        return comfy_io.Schema(
+            node_id="ViduStartEndToVideoNode",
+            display_name="Vidu Start End To Video Generation",
+            category="api node/video/Vidu",
+            description="Generate a video from start and end frames and a prompt",
+            inputs=[
+                comfy_io.Combo.Input(
+                    "model",
+                    options=[model.value for model in VideoModelName],
+                    default=VideoModelName.vidu_q1.value,
+                    tooltip="Model name",
+                ),
+                comfy_io.Image.Input(
+                    "first_frame",
+                    tooltip="Start frame",
+                ),
+                comfy_io.Image.Input(
+                    "end_frame",
+                    tooltip="End frame",
+                ),
+                comfy_io.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A textual description for video generation",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "duration",
+                    default=5,
+                    min=5,
+                    max=5,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    tooltip="Duration of the output video in seconds",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed for video generation (0 for random)",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "resolution",
+                    options=[model.value for model in Resolution],
+                    default=Resolution.r_1080p.value,
+                    tooltip="Supported values may vary by model & duration",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "movement_amplitude",
+                    options=[model.value for model in MovementAmplitude],
+                    default=MovementAmplitude.auto.value,
+                    tooltip="The movement amplitude of objects in the frame",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                comfy_io.Video.Output(),
+            ],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        first_frame: torch.Tensor,
+        end_frame: torch.Tensor,
+        prompt: str,
+        duration: int,
+        seed: int,
+        resolution: str,
+        movement_amplitude: str,
+    ) -> comfy_io.NodeOutput:
+        """Upload both frames, create the Vidu task and return the downloaded video.
+
+        Raises:
+            ValueError: If the two frames' aspect ratios are not close enough
+                (checked with the 0.8 .. 1.25 relative range, inclusive).
+            RuntimeError: If the task fails or finishes without any creation.
+        """
+        validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False)
+        # The request field is named `model`; `model_name` is not a declared
+        # field of TaskCreationRequest, so the selection must be passed as `model`.
+        payload = TaskCreationRequest(
+            model=model,
+            prompt=prompt,
+            duration=duration,
+            seed=seed,
+            resolution=resolution,
+            movement_amplitude=movement_amplitude,
+        )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
+        # Frames are uploaded one at a time, start frame first, so the order
+        # of `images` is [first_frame, end_frame].
+        payload.images = [
+            (await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0]
+            for frame in (first_frame, end_frame)
+        ]
+        results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id)
+        return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
+
+
+class ViduExtension(ComfyExtension):
+    """Extension exposing the four Vidu API nodes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        vidu_nodes: list[type[comfy_io.ComfyNode]] = [
+            ViduTextToVideoNode,
+            ViduImageToVideoNode,
+            ViduReferenceVideoNode,
+            ViduStartEndToVideoNode,
+        ]
+        return vidu_nodes
+
+async def comfy_entrypoint() -> ViduExtension:
+    """Create and return a new ``ViduExtension`` instance."""
+    extension = ViduExtension()
+    return extension
--- a/comfy_api_nodes/util/validation_utils.py
+++ b/comfy_api_nodes/util/validation_utils.py
@@ -53,6 +53,53 @@ def validate_image_aspect_ratio(
        )


+def validate_image_aspect_ratio_range(
+    image: torch.Tensor,
+    min_ratio: tuple[float, float],  # e.g. (1, 4)
+    max_ratio: tuple[float, float],  # e.g. (4, 1)
+    *,
+    strict: bool = True,             # True -> (min, max); False -> [min, max]
+) -> float:
+    """Check that the image's width/height ratio lies between two ratios.
+
+    Both bounds are given as ``(width, height)`` pairs and may be passed in
+    either order. With ``strict`` the bounds are exclusive, otherwise inclusive.
+
+    Returns:
+        The image's aspect ratio (width / height).
+
+    Raises:
+        ValueError: If a ratio component or an image dimension is not
+            positive, or if the aspect ratio is outside the allowed range.
+    """
+    min_w, min_h = min_ratio
+    max_w, max_h = max_ratio
+    if any(part <= 0 for part in (min_w, min_h, max_w, max_h)):
+        raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).")
+
+    lower = min_w / min_h
+    upper = max_w / max_h
+    if lower > upper:
+        lower, upper = upper, lower
+        # Keep the pairs in step with the swapped bounds; they are only used in the error text.
+        min_w, min_h, max_w, max_h = max_w, max_h, min_w, min_h
+
+    width, height = get_image_dimensions(image)
+    if width <= 0 or height <= 0:
+        raise ValueError(f"Invalid image dimensions: {width}x{height}")
+
+    ratio = width / height
+    if strict:
+        in_range = lower < ratio < upper
+        op = "<"
+    else:
+        in_range = lower <= ratio <= upper
+        op = "≤"
+    if in_range:
+        return ratio
+    raise ValueError(f"Image aspect ratio {ratio:.6g} is outside allowed range: {min_w}:{min_h} {op} ratio {op} {max_w}:{max_h}")
+
+
+def validate_aspect_ratio_closeness(
+    start_img,
+    end_img,
+    min_rel: float,
+    max_rel: float,
+    *,
+    strict: bool = False,   # True => exclusive, False => inclusive
+) -> None:
+    """Check that two images have similar aspect ratios.
+
+    The larger aspect ratio is divided by the smaller one, and that quotient
+    is compared against a single limit, ``max(max_rel, 1 / min_rel)``. With
+    ``strict`` a quotient equal to the limit is rejected as well.
+
+    Raises:
+        ValueError: If any image dimension is not positive, or if the aspect
+            ratios differ by more than the limit.
+    """
+    start_w, start_h = get_image_dimensions(start_img)
+    end_w, end_h = get_image_dimensions(end_img)
+    if min(start_w, start_h, end_w, end_h) <= 0:
+        raise ValueError("Invalid image dimensions")
+
+    start_ar = start_w / start_h
+    end_ar = end_w / end_h
+    # Larger over smaller is always >= 1, so only one direction needs checking.
+    quotient = max(start_ar, end_ar) / min(start_ar, end_ar)
+    limit = max(max_rel, 1.0 / min_rel)  # for 0.8..1.25 this is 1.25
+
+    if strict:
+        too_far = quotient >= limit
+    else:
+        too_far = quotient > limit
+    if too_far:
+        raise ValueError(f"Aspect ratios must be close: start/end={start_ar/end_ar:.4f}, allowed range {min_rel}–{max_rel}.")
+
+
 def validate_video_dimensions(
    video: VideoInput,
    min_width: Optional[int] = None,
@@ -98,3 +145,9 @@ def validate_video_duration(
        raise ValueError(
            f"Video duration must be at most {max_duration}s, got {duration}s"
        )
+
+
+def get_number_of_images(images):
+    """Return how many images ``images`` holds.
+
+    A tensor with four or more dimensions counts its first dimension, any
+    other tensor counts as one image, and non-tensor inputs use ``len()``.
+    """
+    if not isinstance(images, torch.Tensor):
+        return len(images)
+    if images.ndim >= 4:
+        return images.shape[0]
+    return 1
--- a/nodes.py
+++ b/nodes.py
@@ -2351,6 +2351,7 @@ async def init_builtin_api_nodes():
        "nodes_moonvalley.py",
        "nodes_rodin.py",
        "nodes_gemini.py",
+        "nodes_vidu.py",
    ]

    if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"):