ComfyAPI Core v0.0.2 (#8962)

* ComfyAPI Core v0.0.2 * Respond to PR feedback * Fix Python 3.9 errors * Fix missing backward compatibility proxy * Reorganize types a bit The input types, input impls, and utility types are now all available in the versioned API. See the change in `comfy_extras/nodes_video.py` for an example of their usage. * Remove the need for `--generate-api-stubs` * Fix generated stubs differing by Python version * Fix ruff formatting issues
2025-09-13 04:55:53 +00:00 · 2025-07-29 19:17:22 -07:00
parent 2f74e17975
commit 0a3d062e06
36 changed files with 2128 additions and 518 deletions
--- a/comfy_api/latest/init.py
+++ b/comfy_api/latest/init.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+from typing import Type, TYPE_CHECKING
+from comfy_api.internal import ComfyAPIBase
+from comfy_api.internal.singleton import ProxiedSingleton
+from comfy_api.internal.async_to_sync import create_sync_class
+from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
+from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents
+from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents
+from comfy_execution.utils import get_executing_context
+from comfy_execution.progress import get_progress_state, PreviewImageTuple
+from PIL import Image
+from comfy.cli_args import args
+import numpy as np
+
+
+class ComfyAPI_latest(ComfyAPIBase):
+    VERSION = "latest"
+    STABLE = False
+
+    class Execution(ProxiedSingleton):
+        async def set_progress(
+            self,
+            value: float,
+            max_value: float,
+            node_id: str | None = None,
+            preview_image: Image.Image | ImageInput | None = None,
+            ignore_size_limit: bool = False,
+        ) -> None:
+            """
+            Update the progress bar displayed in the ComfyUI interface.
+
+            This function allows custom nodes and API calls to report their progress
+            back to the user interface, providing visual feedback during long operations.
+
+            Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK
+            """
+            executing_context = get_executing_context()
+            if node_id is None and executing_context is not None:
+                node_id = executing_context.node_id
+            if node_id is None:
+                raise ValueError("node_id must be provided if not in executing context")
+
+            # Convert preview_image to PreviewImageTuple if needed
+            to_display: PreviewImageTuple | Image.Image | ImageInput | None = preview_image
+            if to_display is not None:
+                # First convert to PIL Image if needed
+                if isinstance(to_display, ImageInput):
+                    # Convert ImageInput (torch.Tensor) to PIL Image
+                    # Handle tensor shape [B, H, W, C] -> get first image if batch
+                    tensor = to_display
+                    if len(tensor.shape) == 4:
+                        tensor = tensor[0]
+
+                    # Convert to numpy array and scale to 0-255
+                    image_np = (tensor.cpu().numpy() * 255).astype(np.uint8)
+                    to_display = Image.fromarray(image_np)
+
+                if isinstance(to_display, Image.Image):
+                    # Detect image format from PIL Image
+                    image_format = to_display.format if to_display.format else "JPEG"
+                    # Use None for preview_size if ignore_size_limit is True
+                    preview_size = None if ignore_size_limit else args.preview_size
+                    to_display = (image_format, to_display, preview_size)
+
+            get_progress_state().update_progress(
+                node_id=node_id,
+                value=value,
+                max_value=max_value,
+                image=to_display,
+            )
+
+    execution: Execution
+
+class Input:
+    Image = ImageInput
+    Audio = AudioInput
+    Mask = MaskInput
+    Latent = LatentInput
+    Video = VideoInput
+
+class InputImpl:
+    VideoFromFile = VideoFromFile
+    VideoFromComponents = VideoFromComponents
+
+class Types:
+    VideoCodec = VideoCodec
+    VideoContainer = VideoContainer
+    VideoComponents = VideoComponents
+
+ComfyAPI = ComfyAPI_latest
+
+# Create a synchronous version of the API
+if TYPE_CHECKING:
+    import comfy_api.latest.generated.ComfyAPISyncStub  # type: ignore
+
+    ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub]
+ComfyAPISync = create_sync_class(ComfyAPI_latest)
+
+__all__ = [
+    "ComfyAPI",
+    "ComfyAPISync",
+    "Input",
+    "InputImpl",
+    "Types",
+]
--- a/comfy_api/latest/_input/init.py
+++ b/comfy_api/latest/_input/init.py
@@ -0,0 +1,10 @@
+from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput
+from .video_types import VideoInput
+
+__all__ = [
+    "ImageInput",
+    "AudioInput",
+    "VideoInput",
+    "MaskInput",
+    "LatentInput",
+]
--- a/comfy_api/latest/_input/basic_types.py
+++ b/comfy_api/latest/_input/basic_types.py
@@ -0,0 +1,42 @@
+import torch
+from typing import TypedDict, List, Optional
+
+ImageInput = torch.Tensor
+"""
+An image in format [B, H, W, C] where B is the batch size, C is the number of channels,
+"""
+
+MaskInput = torch.Tensor
+"""
+A mask in format [B, H, W] where B is the batch size
+"""
+
+class AudioInput(TypedDict):
+    """
+    TypedDict representing audio input.
+    """
+
+    waveform: torch.Tensor
+    """
+    Tensor in the format [B, C, T] where B is the batch size, C is the number of channels,
+    """
+
+    sample_rate: int
+
+class LatentInput(TypedDict):
+    """
+    TypedDict representing latent input.
+    """
+
+    samples: torch.Tensor
+    """
+    Tensor in the format [B, C, H, W] where B is the batch size, C is the number of channels,
+    H is the height, and W is the width.
+    """
+
+    noise_mask: Optional[MaskInput]
+    """
+    Optional noise mask tensor in the same format as samples.
+    """
+
+    batch_index: Optional[List[int]]
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Optional, Union
+import io
+import av
+from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
+
+class VideoInput(ABC):
+    """
+    Abstract base class for video input types.
+    """
+
+    @abstractmethod
+    def get_components(self) -> VideoComponents:
+        """
+        Abstract method to get the video components (images, audio, and frame rate).
+
+        Returns:
+            VideoComponents containing images, audio, and frame rate
+        """
+        pass
+
+    @abstractmethod
+    def save_to(
+        self,
+        path: str,
+        format: VideoContainer = VideoContainer.AUTO,
+        codec: VideoCodec = VideoCodec.AUTO,
+        metadata: Optional[dict] = None
+    ):
+        """
+        Abstract method to save the video input to a file.
+        """
+        pass
+
+    def get_stream_source(self) -> Union[str, io.BytesIO]:
+        """
+        Get a streamable source for the video. This allows processing without
+        loading the entire video into memory.
+
+        Returns:
+            Either a file path (str) or a BytesIO object that can be opened with av.
+
+        Default implementation creates a BytesIO buffer, but subclasses should
+        override this for better performance when possible.
+        """
+        buffer = io.BytesIO()
+        self.save_to(buffer)
+        buffer.seek(0)
+        return buffer
+
+    # Provide a default implementation, but subclasses can provide optimized versions
+    # if possible.
+    def get_dimensions(self) -> tuple[int, int]:
+        """
+        Returns the dimensions of the video input.
+
+        Returns:
+            Tuple of (width, height)
+        """
+        components = self.get_components()
+        return components.images.shape[2], components.images.shape[1]
+
+    def get_duration(self) -> float:
+        """
+        Returns the duration of the video in seconds.
+
+        Returns:
+            Duration in seconds
+        """
+        components = self.get_components()
+        frame_count = components.images.shape[0]
+        return float(frame_count / components.frame_rate)
+
+    def get_container_format(self) -> str:
+        """
+        Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').
+
+        Returns:
+            Container format as string
+        """
+        # Default implementation - subclasses should override for better performance
+        source = self.get_stream_source()
+        with av.open(source, mode="r") as container:
+            return container.format.name
--- a/comfy_api/latest/_input_impl/init.py
+++ b/comfy_api/latest/_input_impl/init.py
@@ -0,0 +1,7 @@
+from .video_types import VideoFromFile, VideoFromComponents
+
+__all__ = [
+    # Implementations
+    "VideoFromFile",
+    "VideoFromComponents",
+]
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -0,0 +1,324 @@
+from __future__ import annotations
+from av.container import InputContainer
+from av.subtitles.stream import SubtitleStream
+from fractions import Fraction
+from typing import Optional
+from comfy_api.latest._input import AudioInput, VideoInput
+import av
+import io
+import json
+import numpy as np
+import torch
+from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
+
+
+def container_to_output_format(container_format: str | None) -> str | None:
+    """
+    A container's `format` may be a comma-separated list of formats.
+    E.g., iso container's `format` may be `mov,mp4,m4a,3gp,3g2,mj2`.
+    However, writing to a file/stream with `av.open` requires a single format,
+    or `None` to auto-detect.
+    """
+    if not container_format:
+        return None  # Auto-detect
+
+    if "," not in container_format:
+        return container_format
+
+    formats = container_format.split(",")
+    return formats[0]
+
+
+def get_open_write_kwargs(
+    dest: str | io.BytesIO, container_format: str, to_format: str | None
+) -> dict:
+    """Get kwargs for writing a `VideoFromFile` to a file/stream with `av.open`"""
+    open_kwargs = {
+        "mode": "w",
+        # If isobmff, preserve custom metadata tags (workflow, prompt, extra_pnginfo)
+        "options": {"movflags": "use_metadata_tags"},
+    }
+
+    is_write_to_buffer = isinstance(dest, io.BytesIO)
+    if is_write_to_buffer:
+        # Set output format explicitly, since it cannot be inferred from file extension
+        if to_format == VideoContainer.AUTO:
+            to_format = container_format.lower()
+        elif isinstance(to_format, str):
+            to_format = to_format.lower()
+        open_kwargs["format"] = container_to_output_format(to_format)
+
+    return open_kwargs
+
+
+class VideoFromFile(VideoInput):
+    """
+    Class representing video input from a file.
+    """
+
+    def __init__(self, file: str | io.BytesIO):
+        """
+        Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object
+        containing the file contents.
+        """
+        self.__file = file
+
+    def get_stream_source(self) -> str | io.BytesIO:
+        """
+        Return the underlying file source for efficient streaming.
+        This avoids unnecessary memory copies when the source is already a file path.
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)
+        return self.__file
+
+    def get_dimensions(self) -> tuple[int, int]:
+        """
+        Returns the dimensions of the video input.
+
+        Returns:
+            Tuple of (width, height)
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)  # Reset the BytesIO object to the beginning
+        with av.open(self.__file, mode='r') as container:
+            for stream in container.streams:
+                if stream.type == 'video':
+                    assert isinstance(stream, av.VideoStream)
+                    return stream.width, stream.height
+        raise ValueError(f"No video stream found in file '{self.__file}'")
+
+    def get_duration(self) -> float:
+        """
+        Returns the duration of the video in seconds.
+
+        Returns:
+            Duration in seconds
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)
+        with av.open(self.__file, mode="r") as container:
+            if container.duration is not None:
+                return float(container.duration / av.time_base)
+
+            # Fallback: calculate from frame count and frame rate
+            video_stream = next(
+                (s for s in container.streams if s.type == "video"), None
+            )
+            if video_stream and video_stream.frames and video_stream.average_rate:
+                return float(video_stream.frames / video_stream.average_rate)
+
+            # Last resort: decode frames to count them
+            if video_stream and video_stream.average_rate:
+                frame_count = 0
+                container.seek(0)
+                for packet in container.demux(video_stream):
+                    for _ in packet.decode():
+                        frame_count += 1
+                if frame_count > 0:
+                    return float(frame_count / video_stream.average_rate)
+
+        raise ValueError(f"Could not determine duration for file '{self.__file}'")
+
+    def get_container_format(self) -> str:
+        """
+        Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').
+
+        Returns:
+            Container format as string
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)
+        with av.open(self.__file, mode='r') as container:
+            return container.format.name
+
+    def get_components_internal(self, container: InputContainer) -> VideoComponents:
+        # Get video frames
+        frames = []
+        for frame in container.decode(video=0):
+            img = frame.to_ndarray(format='rgb24')  # shape: (H, W, 3)
+            img = torch.from_numpy(img) / 255.0  # shape: (H, W, 3)
+            frames.append(img)
+
+        images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)
+
+        # Get frame rate
+        video_stream = next(s for s in container.streams if s.type == 'video')
+        frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1)
+
+        # Get audio if available
+        audio = None
+        try:
+            container.seek(0)  # Reset the container to the beginning
+            for stream in container.streams:
+                if stream.type != 'audio':
+                    continue
+                assert isinstance(stream, av.AudioStream)
+                audio_frames = []
+                for packet in container.demux(stream):
+                    for frame in packet.decode():
+                        assert isinstance(frame, av.AudioFrame)
+                        audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
+                if len(audio_frames) > 0:
+                    audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
+                    audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
+                    audio = AudioInput({
+                        "waveform": audio_tensor,
+                        "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1,
+                    })
+        except StopIteration:
+            pass  # No audio stream
+
+        metadata = container.metadata
+        return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
+
+    def get_components(self) -> VideoComponents:
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)  # Reset the BytesIO object to the beginning
+        with av.open(self.__file, mode='r') as container:
+            return self.get_components_internal(container)
+        raise ValueError(f"No video stream found in file '{self.__file}'")
+
+    def save_to(
+        self,
+        path: str | io.BytesIO,
+        format: VideoContainer = VideoContainer.AUTO,
+        codec: VideoCodec = VideoCodec.AUTO,
+        metadata: Optional[dict] = None
+    ):
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)  # Reset the BytesIO object to the beginning
+        with av.open(self.__file, mode='r') as container:
+            container_format = container.format.name
+            video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None
+            reuse_streams = True
+            if format != VideoContainer.AUTO and format not in container_format.split(","):
+                reuse_streams = False
+            if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
+                reuse_streams = False
+
+            if not reuse_streams:
+                components = self.get_components_internal(container)
+                video = VideoFromComponents(components)
+                return video.save_to(
+                    path,
+                    format=format,
+                    codec=codec,
+                    metadata=metadata
+                )
+
+            streams = container.streams
+
+            open_kwargs = get_open_write_kwargs(path, container_format, format)
+            with av.open(path, **open_kwargs) as output_container:
+                # Copy over the original metadata
+                for key, value in container.metadata.items():
+                    if metadata is None or key not in metadata:
+                        output_container.metadata[key] = value
+
+                # Add our new metadata
+                if metadata is not None:
+                    for key, value in metadata.items():
+                        if isinstance(value, str):
+                            output_container.metadata[key] = value
+                        else:
+                            output_container.metadata[key] = json.dumps(value)
+
+                # Add streams to the new container
+                stream_map = {}
+                for stream in streams:
+                    if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)):
+                        out_stream = output_container.add_stream_from_template(template=stream, opaque=True)
+                        stream_map[stream] = out_stream
+
+                # Write packets to the new container
+                for packet in container.demux():
+                    if packet.stream in stream_map and packet.dts is not None:
+                        packet.stream = stream_map[packet.stream]
+                        output_container.mux(packet)
+
+class VideoFromComponents(VideoInput):
+    """
+    Class representing video input from tensors.
+    """
+
+    def __init__(self, components: VideoComponents):
+        self.__components = components
+
+    def get_components(self) -> VideoComponents:
+        return VideoComponents(
+            images=self.__components.images,
+            audio=self.__components.audio,
+            frame_rate=self.__components.frame_rate
+        )
+
+    def save_to(
+        self,
+        path: str,
+        format: VideoContainer = VideoContainer.AUTO,
+        codec: VideoCodec = VideoCodec.AUTO,
+        metadata: Optional[dict] = None
+    ):
+        if format != VideoContainer.AUTO and format != VideoContainer.MP4:
+            raise ValueError("Only MP4 format is supported for now")
+        if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
+            raise ValueError("Only H264 codec is supported for now")
+        with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output:
+            # Add metadata before writing any streams
+            if metadata is not None:
+                for key, value in metadata.items():
+                    output.metadata[key] = json.dumps(value)
+
+            frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000)
+            # Create a video stream
+            video_stream = output.add_stream('h264', rate=frame_rate)
+            video_stream.width = self.__components.images.shape[2]
+            video_stream.height = self.__components.images.shape[1]
+            video_stream.pix_fmt = 'yuv420p'
+
+            # Create an audio stream
+            audio_sample_rate = 1
+            audio_stream: Optional[av.AudioStream] = None
+            if self.__components.audio:
+                audio_sample_rate = int(self.__components.audio['sample_rate'])
+                audio_stream = output.add_stream('aac', rate=audio_sample_rate)
+                audio_stream.sample_rate = audio_sample_rate
+                audio_stream.format = 'fltp'
+
+            # Encode video
+            for i, frame in enumerate(self.__components.images):
+                img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3)
+                frame = av.VideoFrame.from_ndarray(img, format='rgb24')
+                frame = frame.reformat(format='yuv420p')  # Convert to YUV420P as required by h264
+                packet = video_stream.encode(frame)
+                output.mux(packet)
+
+            # Flush video
+            packet = video_stream.encode(None)
+            output.mux(packet)
+
+            if audio_stream and self.__components.audio:
+                # Encode audio
+                samples_per_frame = int(audio_sample_rate / frame_rate)
+                num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame
+                for i in range(num_frames):
+                    start = i * samples_per_frame
+                    end = start + samples_per_frame
+                    # TODO(Feature) - Add support for stereo audio
+                    chunk = (
+                        self.__components.audio["waveform"][0, 0, start:end]
+                        .unsqueeze(0)
+                        .contiguous()
+                        .numpy()
+                    )
+                    audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono')
+                    audio_frame.sample_rate = audio_sample_rate
+                    audio_frame.pts = i * samples_per_frame
+                    for packet in audio_stream.encode(audio_frame):
+                        output.mux(packet)
+
+                # Flush audio
+                for packet in audio_stream.encode(None):
+                    output.mux(packet)
+
+
--- a/comfy_api/latest/_util/init.py
+++ b/comfy_api/latest/_util/init.py
@@ -0,0 +1,8 @@
+from .video_types import VideoContainer, VideoCodec, VideoComponents
+
+__all__ = [
+    # Utility Types
+    "VideoContainer",
+    "VideoCodec",
+    "VideoComponents",
+]
--- a/comfy_api/latest/_util/video_types.py
+++ b/comfy_api/latest/_util/video_types.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from enum import Enum
+from fractions import Fraction
+from typing import Optional
+from comfy_api.latest._input import ImageInput, AudioInput
+
+class VideoCodec(str, Enum):
+    AUTO = "auto"
+    H264 = "h264"
+
+    @classmethod
+    def as_input(cls) -> list[str]:
+        """
+        Returns a list of codec names that can be used as node input.
+        """
+        return [member.value for member in cls]
+
+class VideoContainer(str, Enum):
+    AUTO = "auto"
+    MP4 = "mp4"
+
+    @classmethod
+    def as_input(cls) -> list[str]:
+        """
+        Returns a list of container names that can be used as node input.
+        """
+        return [member.value for member in cls]
+
+    @classmethod
+    def get_extension(cls, value) -> str:
+        """
+        Returns the file extension for the container.
+        """
+        if isinstance(value, str):
+            value = cls(value)
+        if value == VideoContainer.MP4 or value == VideoContainer.AUTO:
+            return "mp4"
+        return ""
+
+@dataclass
+class VideoComponents:
+    """
+    Dataclass representing the components of a video.
+    """
+
+    images: ImageInput
+    frame_rate: Fraction
+    audio: Optional[AudioInput] = None
+    metadata: Optional[dict] = None
+
+
--- a/comfy_api/latest/generated/ComfyAPISyncStub.pyi
+++ b/comfy_api/latest/generated/ComfyAPISyncStub.pyi
@@ -0,0 +1,20 @@
+from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple
+from comfy_api.latest import ComfyAPI_latest
+from PIL.Image import Image
+from torch import Tensor
+class ComfyAPISyncStub:
+    def __init__(self) -> None: ...
+
+    class ExecutionSync:
+        def __init__(self) -> None: ...
+        """
+        Update the progress bar displayed in the ComfyUI interface.
+
+        This function allows custom nodes and API calls to report their progress
+        back to the user interface, providing visual feedback during long operations.
+
+        Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK
+        """
+        def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ...
+
+    execution: ExecutionSync