Add Veo3 video generation node with audio support (#9110)

- Create new Veo3VideoGenerationNode that extends VeoVideoGenerationNode
- Add support for generateAudio parameter (only for Veo3 models)
- Support new Veo3 models: veo-3.0-generate-001, veo-3.0-fast-generate-001
- Fix Veo3 duration constraint to 8 seconds only
- Update original node to be clearly Veo 2 only
- Update API paths to use model parameter: /proxy/veo/{model}/generate
- Regenerate API types from staging to include generateAudio parameter
- Fix TripoModelVersion enum reference after regeneration
- Mark generated API types file in .gitattributes
2025-08-16 01:44:22 +00:00 · 2025-08-04 22:52:25 -07:00 · 2025-08-04 22:52:25 -07:00 · f69609bbd6
commit f69609bbd6
parent c012400240
4 changed files with 2664 additions and 93 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,3 @@
 /web/assets/** linguist-generated
 /web/** linguist-vendored
 comfy_api_nodes/apis/__init__.py linguist-generated
--- a/comfy_api_nodes/apis/__init__.py
+++ b/comfy_api_nodes/apis/__init__.py
--- a/comfy_api_nodes/apis/tripo_api.py
+++ b/comfy_api_nodes/apis/tripo_api.py
@@ -127,7 +127,7 @@ class TripoTextToModelRequest(BaseModel):
    type: TripoTaskType = Field(TripoTaskType.TEXT_TO_MODEL, description='Type of task')
    prompt: str = Field(..., description='The text prompt describing the model to generate', max_length=1024)
    negative_prompt: Optional[str] = Field(None, description='The negative text prompt', max_length=1024)
-    model_version: Optional[TripoModelVersion] = TripoModelVersion.V2_5
+    model_version: Optional[TripoModelVersion] = TripoModelVersion.v2_5_20250123
    face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to')
    texture: Optional[bool] = Field(True, description='Whether to apply texture to the generated model')
    pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the generated model')
--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@@ -8,10 +8,10 @@ from typing import Optional
 from comfy.comfy_types.node_typing import IO, ComfyNodeABC
 from comfy_api.input_impl.video_types import VideoFromFile
 from comfy_api_nodes.apis import (
-    Veo2GenVidRequest,
+    VeoGenVidRequest,
-    Veo2GenVidResponse,
+    VeoGenVidResponse,
-    Veo2GenVidPollRequest,
+    VeoGenVidPollRequest,
-    Veo2GenVidPollResponse
+    VeoGenVidPollResponse
 )
 from comfy_api_nodes.apis.client import (
    ApiEndpoint,
@@ -35,7 +35,7 @@ def convert_image_to_base64(image: torch.Tensor):
    return tensor_to_base64_string(scaled_image)
-def get_video_url_from_response(poll_response: Veo2GenVidPollResponse) -> Optional[str]:
+def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optional[str]:
    if (
        poll_response.response
        and hasattr(poll_response.response, "videos")
@@ -130,6 +130,14 @@ class VeoVideoGenerationNode(ComfyNodeABC):
                    "default": None,
                    "tooltip": "Optional reference image to guide video generation",
                }),
                "model": (
                    IO.COMBO,
                    {
                        "options": ["veo-2.0-generate-001"],
                        "default": "veo-2.0-generate-001",
                        "tooltip": "Veo 2 model to use for video generation",
                    },
                ),
            },
            "hidden": {
                "auth_token": "AUTH_TOKEN_COMFY_ORG",
@@ -141,7 +149,7 @@ class VeoVideoGenerationNode(ComfyNodeABC):
    RETURN_TYPES = (IO.VIDEO,)
    FUNCTION = "generate_video"
    CATEGORY = "api node/video/Veo"
-    DESCRIPTION = "Generates videos from text prompts using Google's Veo API"
+    DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API"
    API_NODE = True
    def generate_video(
@@ -154,6 +162,8 @@ class VeoVideoGenerationNode(ComfyNodeABC):
        person_generation="ALLOW",
        seed=0,
        image=None,
        model="veo-2.0-generate-001",
        generate_audio=False,
        unique_id: Optional[str] = None,
        **kwargs,
    ):
@@ -188,16 +198,19 @@ class VeoVideoGenerationNode(ComfyNodeABC):
            parameters["negativePrompt"] = negative_prompt
        if seed > 0:
            parameters["seed"] = seed
        # Only add generateAudio for Veo 3 models
        if "veo-3.0" in model:
            parameters["generateAudio"] = generate_audio
        # Initial request to start video generation
        initial_operation = SynchronousOperation(
            endpoint=ApiEndpoint(
-                path="/proxy/veo/generate",
+                path=f"/proxy/veo/{model}/generate",
                method=HttpMethod.POST,
-                request_model=Veo2GenVidRequest,
+                request_model=VeoGenVidRequest,
-                response_model=Veo2GenVidResponse
+                response_model=VeoGenVidResponse
            ),
-            request=Veo2GenVidRequest(
+            request=VeoGenVidRequest(
                instances=instances,
                parameters=parameters
            ),
@@ -223,16 +236,16 @@ class VeoVideoGenerationNode(ComfyNodeABC):
        # Define the polling operation
        poll_operation = PollingOperation(
            poll_endpoint=ApiEndpoint(
-                path="/proxy/veo/poll",
+                path=f"/proxy/veo/{model}/poll",
                method=HttpMethod.POST,
-                request_model=Veo2GenVidPollRequest,
+                request_model=VeoGenVidPollRequest,
-                response_model=Veo2GenVidPollResponse
+                response_model=VeoGenVidPollResponse
            ),
            completed_statuses=["completed"],
            failed_statuses=[],  # No failed statuses, we'll handle errors after polling
            status_extractor=status_extractor,
            progress_extractor=progress_extractor,
-            request=Veo2GenVidPollRequest(
+            request=VeoGenVidPollRequest(
                operationName=operation_name
            ),
            auth_kwargs=kwargs,
@@ -298,11 +311,64 @@ class VeoVideoGenerationNode(ComfyNodeABC):
        return (VideoFromFile(video_io),)
-# Register the node
+class Veo3VideoGenerationNode(VeoVideoGenerationNode):
    """
    Generates videos from text prompts using Google's Veo 3 API.
    Supported models:
    - veo-3.0-generate-001
    - veo-3.0-fast-generate-001
    This node extends the base Veo node with Veo 3 specific features including
    audio generation and fixed 8-second duration.
    """
    @classmethod
    def INPUT_TYPES(s):
        parent_input = super().INPUT_TYPES()
        # Update model options for Veo 3
        parent_input["optional"]["model"] = (
            IO.COMBO,
            {
                "options": ["veo-3.0-generate-001", "veo-3.0-fast-generate-001"],
                "default": "veo-3.0-generate-001",
                "tooltip": "Veo 3 model to use for video generation",
            },
        )
        # Add generateAudio parameter
        parent_input["optional"]["generate_audio"] = (
            IO.BOOLEAN,
            {
                "default": False,
                "tooltip": "Generate audio for the video. Supported by all Veo 3 models.",
            }
        )
        # Update duration constraints for Veo 3 (only 8 seconds supported)
        parent_input["optional"]["duration_seconds"] = (
            IO.INT,
            {
                "default": 8,
                "min": 8,
                "max": 8,
                "step": 1,
                "display": "number",
                "tooltip": "Duration of the output video in seconds (Veo 3 only supports 8 seconds)",
            },
        )
        return parent_input
 # Register the nodes
 NODE_CLASS_MAPPINGS = {
    "VeoVideoGenerationNode": VeoVideoGenerationNode,
    "Veo3VideoGenerationNode": Veo3VideoGenerationNode,
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "VeoVideoGenerationNode": "Google Veo2 Video Generation",
+    "VeoVideoGenerationNode": "Google Veo 2 Video Generation",
    "Veo3VideoGenerationNode": "Google Veo 3 Video Generation",
 }