Defined TypedDict hints for Latent, Conditioning, and Audio types
This commit is contained in:
parent ba857bd8a0
commit d0c077423a
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Any, Literal, TYPE_CHECKING, TypeVar, Callable, Optional, cast
+from typing import Any, Literal, TYPE_CHECKING, TypeVar, Callable, Optional, cast, TypedDict, NotRequired
 from enum import Enum
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, asdict
@@ -17,6 +17,7 @@ from comfy.sd import StyleModel as StyleModel_
 from comfy.clip_vision import ClipVisionModel
 from comfy.clip_vision import Output as ClipVisionOutput_
 from comfy_api.input import VideoInput
+from comfy.hooks import HookGroup


 class FolderType(str, Enum):
@@ -448,11 +449,132 @@ class Mask(ComfyTypeIO):

 @comfytype(io_type=IO.LATENT)
 class Latent(ComfyTypeIO):
-    Type = Any # TODO: make Type a TypedDict
+    '''Latents are stored as a dictionary.'''
+    class LatentDict(TypedDict):
+        samples: torch.Tensor
+        '''Latent tensors.'''
+        noise_mask: NotRequired[torch.Tensor]
+        batch_index: NotRequired[list[int]]
+        type: NotRequired[str]
+        '''Only needed if dealing with these types: audio, hunyuan3dv2'''
+    Type = LatentDict
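For orientation (not part of the diff): a minimal sketch of a value conforming to Latent.LatentDict. The tensor shapes are illustrative — an SD1.x latent for a 512x512 image is [batch, 4, H/8, W/8]; other model families use different channel counts.

    import torch

    latent = {  # conforms to Latent.LatentDict
        "samples": torch.zeros(1, 4, 64, 64),    # required: the latent tensors
        "noise_mask": torch.ones(1, 1, 64, 64),  # optional: mask for inpaint-style sampling
        "batch_index": [0],                      # optional: original batch positions
    }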
 
 @comfytype(io_type=IO.CONDITIONING)
 class Conditioning(ComfyTypeIO):
-    Type = Any # TODO: make Type a TypedDict
+    class PooledDict(TypedDict):
+        pooled_output: torch.Tensor
+        '''Pooled output from CLIP.'''
+        control: NotRequired[ControlNet]
+        '''ControlNet to apply to conditioning.'''
+        control_apply_to_uncond: NotRequired[bool]
+        '''Whether to apply ControlNet to matching negative conditioning at sample time, if applicable.'''
+        cross_attn_controlnet: NotRequired[torch.Tensor]
+        '''CrossAttn from CLIP to use for controlnet only.'''
+        pooled_output_controlnet: NotRequired[torch.Tensor]
+        '''Pooled output from CLIP to use for controlnet only.'''
+        gligen: NotRequired[tuple[str, Gligen, list[tuple[torch.Tensor, int, ...]]]]
+        '''GLIGEN to apply to conditioning.'''
+        area: NotRequired[tuple[int, ...] | tuple[str, float, ...]]
+        '''Set area of conditioning. First half of values apply to dimensions, the second half apply to coordinates.
+        By default, the dimensions are based on total pixel amount, but the first value can be set to "percentage" to use a percentage of the image size instead.
+
+        (1024, 1024, 0, 0) would apply conditioning to the top-left 1024x1024 pixels.
+
+        ("percentage", 0.5, 0.5, 0, 0) would apply conditioning to the top-left 50% of the image.''' # TODO: verify it's actually top-left
+        strength: NotRequired[float]
+        '''Strength of conditioning. Default strength is 1.0.'''
+        mask: NotRequired[torch.Tensor]
+        '''Mask to apply conditioning to.'''
+        mask_strength: NotRequired[float]
+        '''Strength of conditioning mask. Default strength is 1.0.'''
+        set_area_to_bounds: NotRequired[bool]
+        '''Whether conditioning mask should determine bounds of area - if set to false, latents are sampled at full resolution and the result is applied within the mask.'''
+        concat_latent_image: NotRequired[torch.Tensor]
+        '''Used for inpainting and specific models.'''
+        concat_mask: NotRequired[torch.Tensor]
+        '''Used for inpainting and specific models.'''
+        concat_image: NotRequired[torch.Tensor]
+        '''Used by SD_4XUpscale_Conditioning.'''
+        noise_augmentation: NotRequired[float]
+        '''Used by SD_4XUpscale_Conditioning.'''
+        hooks: NotRequired[HookGroup]
+        '''Applies hooks to conditioning.'''
+        default: NotRequired[bool]
+        '''Whether this conditioning is 'default'; default conditioning gets applied to any areas of the image that have no masks/areas applied, assuming at least one area/mask is present during sampling.'''
+        start_percent: NotRequired[float]
+        '''Determines relative step to begin applying conditioning, expressed as a float between 0.0 and 1.0.'''
+        end_percent: NotRequired[float]
+        '''Determines relative step to end applying conditioning, expressed as a float between 0.0 and 1.0.'''
+        clip_start_percent: NotRequired[float]
+        '''Internal variable for conditioning scheduling - start of application, expressed as a float between 0.0 and 1.0.'''
+        clip_end_percent: NotRequired[float]
+        '''Internal variable for conditioning scheduling - end of application, expressed as a float between 0.0 and 1.0.'''
+        attention_mask: NotRequired[torch.Tensor]
+        '''Used by StyleModel.'''
+        attention_mask_img_shape: NotRequired[tuple[int, ...]]
+        '''Used by StyleModel.'''
+        unclip_conditioning: NotRequired[list[dict]]
+        '''Used by unCLIP.'''
+        conditioning_lyrics: NotRequired[torch.Tensor]
+        seconds_start: NotRequired[float]
+        '''Used by StableAudio.'''
+        seconds_total: NotRequired[float]
+        '''Used by StableAudio.'''
+        lyrics_strength: NotRequired[float]
+        '''Used by AceStepAudio.'''
+        width: NotRequired[int]
+        '''Used by certain models (e.g. CLIPTextEncodeSDXL/Refiner, PixArtAlpha).'''
+        height: NotRequired[int]
+        '''Used by certain models (e.g. CLIPTextEncodeSDXL/Refiner, PixArtAlpha).'''
+        aesthetic_score: NotRequired[float]
+        '''Used by CLIPTextEncodeSDXL/Refiner.'''
+        crop_w: NotRequired[int]
+        '''Used by CLIPTextEncodeSDXL.'''
+        crop_h: NotRequired[int]
+        '''Used by CLIPTextEncodeSDXL.'''
+        target_width: NotRequired[int]
+        '''Used by CLIPTextEncodeSDXL.'''
+        target_height: NotRequired[int]
+        '''Used by CLIPTextEncodeSDXL.'''
+        reference_latents: NotRequired[list[torch.Tensor]]
+        '''Used by ReferenceLatent.'''
+        guidance: NotRequired[float]
+        '''Used by Flux-like models with guidance embed.'''
+        guiding_frame_index: NotRequired[int]
+        '''Used by Hunyuan ImageToVideo.'''
+        ref_latent: NotRequired[torch.Tensor]
+        '''Used by Hunyuan ImageToVideo.'''
+        keyframe_idxs: NotRequired[list[int]]
+        '''Used by LTXV.'''
+        frame_rate: NotRequired[float]
+        '''Used by LTXV.'''
+        stable_cascade_prior: NotRequired[torch.Tensor]
+        '''Used by StableCascade.'''
+        elevation: NotRequired[list[float]]
+        '''Used by SV3D.'''
+        azimuth: NotRequired[list[float]]
+        '''Used by SV3D.'''
+        motion_bucket_id: NotRequired[int]
+        '''Used by SVD-like models.'''
+        fps: NotRequired[int]
+        '''Used by SVD-like models.'''
+        augmentation_level: NotRequired[float]
+        '''Used by SVD-like models.'''
+        clip_vision_output: NotRequired[ClipVisionOutput_]
+        '''Used by WAN-like models.'''
+        vace_frames: NotRequired[torch.Tensor]
+        '''Used by WAN VACE.'''
+        vace_mask: NotRequired[torch.Tensor]
+        '''Used by WAN VACE.'''
+        vace_strength: NotRequired[float]
+        '''Used by WAN VACE.'''
+        camera_conditions: NotRequired[Any] # TODO: assign proper type once defined
+        '''Used by WAN Camera.'''
+        time_dim_concat: NotRequired[torch.Tensor]
+        '''Used by WAN Phantom Subject.'''
+
+    CondList = list[tuple[torch.Tensor, PooledDict]]
+    Type = CondList
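Again for orientation: a CONDITIONING value is a list of (cross-attention tensor, options) pairs, which is what CondList encodes. A minimal sketch with illustrative shapes — 77 tokens x 768 dims matches SD1.x CLIP, and pooled_output is the only required key of PooledDict:

    import torch

    cross_attn = torch.zeros(1, 77, 768)       # per-token CLIP embeddings (illustrative shape)
    options = {                                # conforms to Conditioning.PooledDict
        "pooled_output": torch.zeros(1, 768),  # required: pooled CLIP embedding
        "strength": 0.8,                       # optional: scales this conditioning entry
    }
    cond = [(cross_attn, options)]             # conforms to Conditioning.CondList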
 
 @comfytype(io_type=IO.SAMPLER)
 class Sampler(ComfyTypeIO):
@@ -509,7 +631,10 @@ class UpscaleModel(ComfyTypeIO):

 @comfytype(io_type=IO.AUDIO)
 class Audio(ComfyTypeIO):
-    Type = Any # TODO: make Type a TypedDict
+    class AudioDict(TypedDict):
+        waveform: torch.Tensor
+        sample_rate: int
+    Type = AudioDict
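And for AUDIO: a sketch of a conforming dict, assuming one second of stereo silence at 44.1 kHz (ComfyUI audio waveforms are shaped [batch, channels, samples]):

    import torch

    audio = {  # conforms to Audio.AudioDict
        "waveform": torch.zeros(1, 2, 44100),  # [batch, channels, samples]
        "sample_rate": 44100,
    }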
 
 @comfytype(io_type=IO.POINT)
 class Point(ComfyTypeIO):