Merge remote-tracking branch 'origin/master' into pysssss-model-db

2025-08-05 20:56:38 +00:00 · 2025-08-03 16:36:49 +01:00 · 2025-08-03 16:36:49 +01:00 · 54cf14cbbb
commit 54cf14cbbb
parent 7d5160f92c aebac22193
157 changed files with 166127 additions and 1821 deletions
--- a/.ci/windows_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_base_files/README_VERY_IMPORTANT.txt
@ -4,6 +4,9 @@ if you have a NVIDIA gpu:
 run_nvidia_gpu.bat
 if you want to enable the fast fp16 accumulation (faster for fp16 models with slightly less quality):
 run_nvidia_gpu_fast_fp16_accumulation.bat
 To run it in slow CPU mode:
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -15,6 +15,14 @@ body:
        steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen.
        If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
  - type: checkboxes
    id: custom-nodes-test
    attributes:
      label: Custom Node Testing
      description: Please confirm you have tried to reproduce the issue with all custom nodes disabled.
      options:
        - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help)
          required: true
  - type: textarea
    attributes:
      label: Expected Behavior
--- a/.github/ISSUE_TEMPLATE/user-support.yml
+++ b/.github/ISSUE_TEMPLATE/user-support.yml
@ -11,6 +11,14 @@ body:
            **2:** You have made an effort to find public answers to your question before asking here. In other words, you googled it first, and scrolled through recent help topics.
                If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
    - type: checkboxes
      id: custom-nodes-test
      attributes:
        label: Custom Node Testing
        description: Please confirm you have tried to reproduce the issue with all custom nodes disabled.
        options:
          - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help)
            required: true
    - type: textarea
      attributes:
            label: Your question
--- a/.github/workflows/check-line-endings.yml
+++ b/.github/workflows/check-line-endings.yml
@ -0,0 +1,40 @@
 name: Check for Windows Line Endings
 on:
  pull_request:
    branches: ['*'] # Trigger on all pull requests to any branch
 jobs:
  check-line-endings:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history to compare changes
      - name: Check for Windows line endings (CRLF)
        run: |
          # Get the list of changed files in the PR. The paths are written
          # NUL-delimited (-z) to a temporary file so that names containing
          # spaces or other special characters are kept intact, and so that a
          # failing git command still aborts the step.
          CHANGED_LIST=$(mktemp)
          git diff --name-only -z ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} > "$CHANGED_LIST"
          # Flag to track if CRLF is found
          CRLF_FOUND=false
          # Loop through each changed file. Reading from a file (not a pipe)
          # keeps the loop in the current shell so CRLF_FOUND stays visible
          # after the loop.
          while IFS= read -r -d '' FILE; do
            # Check if the file exists and is a text file. "file -b" omits the
            # file name from its output so the name itself cannot match "text".
            if [ -f "$FILE" ] && file -b "$FILE" | grep -q "text"; then
              # Check for CRLF line endings
              if grep -UP '\r$' "$FILE"; then
                echo "Error: Windows line endings (CRLF) detected in $FILE"
                CRLF_FOUND=true
              fi
            fi
          done < "$CHANGED_LIST"
          # Exit with error if CRLF was found
          if [ "$CRLF_FOUND" = true ]; then
            exit 1
          fi
--- a/.github/workflows/release-webhook.yml
+++ b/.github/workflows/release-webhook.yml
@ -0,0 +1,108 @@
 name: Release Webhook
 on:
  release:
    types: [published]
 jobs:
  send-webhook:
    runs-on: ubuntu-latest
    steps:
      - name: Send release webhook
        env:
          WEBHOOK_URL: ${{ secrets.RELEASE_GITHUB_WEBHOOK_URL }}
          WEBHOOK_SECRET: ${{ secrets.RELEASE_GITHUB_WEBHOOK_SECRET }}
        run: |
          # Generate UUID for delivery ID
          DELIVERY_ID=$(uuidgen)
          HOOK_ID="release-webhook-$(date +%s)"
          # Create webhook payload matching GitHub release webhook format.
          # The heredoc delimiter is quoted ('EOF') so the shell treats the body
          # as literal text: release fields containing dollar signs, backticks
          # or backslashes are neither expanded nor executed, and the JSON
          # escapes produced by toJSON() reach the payload unchanged.
          PAYLOAD=$(cat <<'EOF'
          {
            "action": "published",
            "release": {
              "id": ${{ github.event.release.id }},
              "node_id": "${{ github.event.release.node_id }}",
              "url": "${{ github.event.release.url }}",
              "html_url": "${{ github.event.release.html_url }}",
              "assets_url": "${{ github.event.release.assets_url }}",
              "upload_url": "${{ github.event.release.upload_url }}",
              "tag_name": "${{ github.event.release.tag_name }}",
              "target_commitish": "${{ github.event.release.target_commitish }}",
              "name": ${{ toJSON(github.event.release.name) }},
              "body": ${{ toJSON(github.event.release.body) }},
              "draft": ${{ github.event.release.draft }},
              "prerelease": ${{ github.event.release.prerelease }},
              "created_at": "${{ github.event.release.created_at }}",
              "published_at": "${{ github.event.release.published_at }}",
              "author": {
                "login": "${{ github.event.release.author.login }}",
                "id": ${{ github.event.release.author.id }},
                "node_id": "${{ github.event.release.author.node_id }}",
                "avatar_url": "${{ github.event.release.author.avatar_url }}",
                "url": "${{ github.event.release.author.url }}",
                "html_url": "${{ github.event.release.author.html_url }}",
                "type": "${{ github.event.release.author.type }}",
                "site_admin": ${{ github.event.release.author.site_admin }}
              },
              "tarball_url": "${{ github.event.release.tarball_url }}",
              "zipball_url": "${{ github.event.release.zipball_url }}",
              "assets": ${{ toJSON(github.event.release.assets) }}
            },
            "repository": {
              "id": ${{ github.event.repository.id }},
              "node_id": "${{ github.event.repository.node_id }}",
              "name": "${{ github.event.repository.name }}",
              "full_name": "${{ github.event.repository.full_name }}",
              "private": ${{ github.event.repository.private }},
              "owner": {
                "login": "${{ github.event.repository.owner.login }}",
                "id": ${{ github.event.repository.owner.id }},
                "node_id": "${{ github.event.repository.owner.node_id }}",
                "avatar_url": "${{ github.event.repository.owner.avatar_url }}",
                "url": "${{ github.event.repository.owner.url }}",
                "html_url": "${{ github.event.repository.owner.html_url }}",
                "type": "${{ github.event.repository.owner.type }}",
                "site_admin": ${{ github.event.repository.owner.site_admin }}
              },
              "html_url": "${{ github.event.repository.html_url }}",
              "clone_url": "${{ github.event.repository.clone_url }}",
              "git_url": "${{ github.event.repository.git_url }}",
              "ssh_url": "${{ github.event.repository.ssh_url }}",
              "url": "${{ github.event.repository.url }}",
              "created_at": "${{ github.event.repository.created_at }}",
              "updated_at": "${{ github.event.repository.updated_at }}",
              "pushed_at": "${{ github.event.repository.pushed_at }}",
              "default_branch": "${{ github.event.repository.default_branch }}",
              "fork": ${{ github.event.repository.fork }}
            },
            "sender": {
              "login": "${{ github.event.sender.login }}",
              "id": ${{ github.event.sender.id }},
              "node_id": "${{ github.event.sender.node_id }}",
              "avatar_url": "${{ github.event.sender.avatar_url }}",
              "url": "${{ github.event.sender.url }}",
              "html_url": "${{ github.event.sender.html_url }}",
              "type": "${{ github.event.sender.type }}",
              "site_admin": ${{ github.event.sender.site_admin }}
            }
          }
          EOF
          )
          # Generate HMAC-SHA256 signature over the exact payload bytes that are sent
          SIGNATURE=$(echo -n "$PAYLOAD" | openssl dgst -sha256 -hmac "$WEBHOOK_SECRET" -hex | cut -d' ' -f2)
          # Send webhook with required headers
          curl -X POST "$WEBHOOK_URL" \
            -H "Content-Type: application/json" \
            -H "X-GitHub-Event: release" \
            -H "X-GitHub-Delivery: $DELIVERY_ID" \
            -H "X-GitHub-Hook-ID: $HOOK_ID" \
            -H "X-Hub-Signature-256: sha256=$SIGNATURE" \
            -H "User-Agent: GitHub-Actions-Webhook/1.0" \
            -d "$PAYLOAD" \
            --fail --silent --show-error
          echo "✅ Release webhook sent successfully"
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@ -102,5 +102,4 @@ jobs:
          file: ComfyUI_windows_portable_nvidia.7z
          tag: ${{ inputs.git_tag }}
          overwrite: true
-          prerelease: true
+          draft: true
          make_latest: false
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@ -7,7 +7,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "128"
+        default: "129"
      python_minor:
        description: 'python minor version'
@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "2"
+        default: "5"
 #  push:
 #    branches:
 #      - master
@ -53,6 +53,8 @@ jobs:
            ls ../temp_wheel_dir
            ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
            rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
            cd ..
            git clone --depth 1 https://github.com/comfyanonymous/taesd
--- a/README.md
+++ b/README.md
@ -6,6 +6,7 @@
 [![Website][website-shield]][website-url]
 [![Dynamic JSON Badge][discord-shield]][discord-url]
 [![Twitter][twitter-shield]][twitter-url]
 [![Matrix][matrix-shield]][matrix-url]
 <br>
 [![][github-release-shield]][github-release-link]
@ -20,6 +21,8 @@
 <!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
 [discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
 [discord-url]: https://www.comfy.org/discord
 [twitter-shield]: https://img.shields.io/twitter/follow/ComfyUI
 [twitter-url]: https://x.com/ComfyUI
 [github-release-shield]: https://img.shields.io/github/v/release/comfyanonymous/ComfyUI?style=flat&sort=semver
 [github-release-link]: https://github.com/comfyanonymous/ComfyUI/releases
@ -52,7 +55,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
 - Image Models
-   - SD1.x, SD2.x,
+   - SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/))
   - [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
   - [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/)
   - [SD3 and SD3.5](https://comfyanonymous.github.io/ComfyUI_examples/sd3/)
@ -62,13 +65,19 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
   - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
   - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
   - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
   - [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
 - Image Editing Models
   - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
   - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
   - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11)
 - Video Models
   - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
   - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
   - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
   - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
-   - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/)
+   - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
   - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
   - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
 - Audio Models
   - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
   - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
@ -76,9 +85,10 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
   - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
- Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
+- Smart memory management: can automatically run large models on GPUs with as little as 1GB of VRAM by using smart offloading.
 - Works even if you don't have a GPU with: ```--cpu``` (slow)
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs and CLIP models.
+- Can load ckpt and safetensors: All in one checkpoints or standalone diffusion models, VAEs and CLIP models.
 - Safe loading of ckpt, pt, pth, etc.. files.
 - Embeddings/Textual inversion
 - [Loras (regular, locon and loha)](https://comfyanonymous.github.io/ComfyUI_examples/lora/)
 - [Hypernetworks](https://comfyanonymous.github.io/ComfyUI_examples/hypernetworks/)
@ -89,20 +99,19 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
 - [Inpainting](https://comfyanonymous.github.io/ComfyUI_examples/inpaint/) with both regular and inpainting models.
 - [ControlNet and T2I-Adapter](https://comfyanonymous.github.io/ComfyUI_examples/controlnet/)
 - [Upscale Models (ESRGAN, ESRGAN variants, SwinIR, Swin2SR, etc...)](https://comfyanonymous.github.io/ComfyUI_examples/upscale_models/)
 - [unCLIP Models](https://comfyanonymous.github.io/ComfyUI_examples/unclip/)
 - [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/)
 - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
 - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
 - Latent previews with [TAESD](#how-to-show-high-quality-previews)
- Starts up very fast.
+- Works fully offline: core will never download anything unless you want to.
- Works fully offline: will never download anything.
+- Optional API nodes to use paid models from external providers through the online [Comfy API](https://docs.comfy.org/tutorials/api-nodes/overview).
 - [Config file](extra_model_paths.yaml.example) to set the search paths for models.
 Workflow examples can be found on the [Examples page](https://comfyanonymous.github.io/ComfyUI_examples/)
 ## Release Process
-ComfyUI follows a weekly release cycle every Friday, with three interconnected repositories:
+ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
   - Releases a new stable version (e.g., v0.7.0)
@ -170,10 +179,6 @@ If you have trouble extracting it, right click the file -> properties -> unblock
 See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
 ## Jupyter Notebook
 To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb)
 ## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started)
@ -235,7 +240,7 @@ Nvidia users should install stable pytorch using this command:
 This is the command to install pytorch nightly instead which might have performance improvements.
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129```
 #### Troubleshooting
@ -268,6 +273,8 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
 #### DirectML (AMD Cards on Windows)
 This is very badly supported and is not recommended. There are some unofficial builds of pytorch ROCm on windows that will give you a much better experience than this. This readme will be updated once official pytorch ROCm builds for windows come out.
 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
 #### Ascend NPUs
@ -287,6 +294,13 @@ For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a
 2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html)
 3. Launch ComfyUI by running `python main.py`
 #### Iluvatar Corex
 For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step guide tailored to your platform and installation method:
 1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536)
 2. Launch ComfyUI by running `python main.py`
 # Running
 ```python main.py```
--- a/alembic_db/env.py
+++ b/alembic_db/env.py
@ -19,15 +19,12 @@ target_metadata = Base.metadata
 def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.
    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well.  By skipping the Engine creation
    we don't even need a DBAPI to be available.
    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
@ -43,10 +40,8 @@ def run_migrations_offline() -> None:
 def run_migrations_online() -> None:
    """Run migrations in 'online' mode.
    In this scenario we need to create an Engine
    and associate a connection with the context.
    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section, {}),
--- a/app/database/db.py
+++ b/app/database/db.py
@ -23,9 +23,7 @@ except ImportError as e:
        f"""
 ------------------------------------------------------------------------
 Error importing dependencies: {e}
 {get_missing_requirements_message()}
 This error is happening because ComfyUI now uses a local sqlite database.
 ------------------------------------------------------------------------
 """.strip()
@ -85,7 +83,9 @@ def init_db():
    script = ScriptDirectory.from_config(config)
    target_rev = script.get_current_head()
-    if current_rev != target_rev:
+    if target_rev is None:
        logging.warning("No target revision found.")
    elif current_rev != target_rev:
        # Backup the database pre upgrade
        backup_path = db_path + ".bkp"
        if db_exists:
@ -101,7 +101,7 @@ def init_db():
                # Restore the database from backup if upgrade fails
                shutil.copy(backup_path, db_path)
                os.remove(backup_path)
-            logging.error(f"Error upgrading database: {e}")
+            logging.exception("Error upgrading database: ")
            raise e
    global Session
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -17,9 +17,11 @@ import requests
 from typing_extensions import NotRequired
 from utils.install_util import get_missing_requirements_message, requirements_path
 from comfy.cli_args import DEFAULT_VERSION_STRING
 import app.logger
 def frontend_install_warning_message():
    return f"""
 {get_missing_requirements_message()}
@ -27,18 +29,48 @@ def frontend_install_warning_message():
 This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
 """.strip()
 def parse_version(version: str) -> tuple[int, int, int]:
        """Split a dot-separated version string into a tuple of integers, e.g. "1.2.3" -> (1, 2, 3)."""
        components = version.split(".")
        return tuple(int(component) for component in components)
 def is_valid_version(version: str) -> bool:
    """Validate if a string is a valid semantic version (X.Y.Z format).

    The whole string must consist of exactly three dot-separated groups of
    digits. ``re.fullmatch`` is used instead of ``re.match`` with ``$``
    because ``$`` also matches just before a trailing newline, which would
    let a value such as ``"1.2.3\\n"`` through.

    Returns:
        True when ``version`` is exactly ``X.Y.Z`` with numeric parts,
        False otherwise.
    """
    pattern = r"(\d+)\.(\d+)\.(\d+)"
    return re.fullmatch(pattern, version) is not None
 def get_installed_frontend_version():
    """Return the version string that ``version()`` reports for "comfyui-frontend-package"."""
    return version("comfyui-frontend-package")
 def get_required_frontend_version():
    """Look up the pinned frontend version in requirements.txt.

    Scans the file at ``requirements_path`` for the first line starting with
    ``comfyui-frontend-package==`` and returns the text after the last ``==``
    when it passes ``is_valid_version``.

    Returns:
        The version string, or None (after logging an error) when the file is
        missing or unreadable, the package line is absent, or the pinned value
        is not a valid version.
    """
    try:
        with open(requirements_path, "r", encoding="utf-8") as f:
            for raw_line in f:
                entry = raw_line.strip()
                if not entry.startswith("comfyui-frontend-package=="):
                    continue
                # Only the first matching line is considered.
                version_str = entry.split("==")[-1]
                if is_valid_version(version_str):
                    return version_str
                logging.error(f"Invalid version format in requirements.txt: {version_str}")
                return None
            # Reached only when no line pinned the frontend package.
            logging.error("comfyui-frontend-package not found in requirements.txt")
            return None
    except FileNotFoundError:
        logging.error("requirements.txt not found. Cannot determine required frontend version.")
        return None
    except Exception as e:
        logging.error(f"Error reading requirements.txt: {e}")
        return None
 def check_frontend_version():
    """Check if the frontend version is up to date."""
    def parse_version(version: str) -> tuple[int, int, int]:
        return tuple(map(int, version.split(".")))
    try:
-        frontend_version_str = version("comfyui-frontend-package")
+        frontend_version_str = get_installed_frontend_version()
        frontend_version = parse_version(frontend_version_str)
-        with open(requirements_path, "r", encoding="utf-8") as f:
+        required_frontend_str = get_required_frontend_version()
-            required_frontend = parse_version(f.readline().split("=")[-1])
+        required_frontend = parse_version(required_frontend_str)
        if frontend_version < required_frontend:
            app.logger.log_startup_warning(
                f"""
@ -110,9 +142,22 @@ class FrontEndProvider:
        response.raise_for_status()  # Raises an HTTPError if the response was an error
        return response.json()
    @cached_property
    def latest_prerelease(self) -> Release:
        """Return the newest pre-release, even when a newer regular release exists.

        Raises:
            ValueError: If none of ``self.all_releases`` is flagged as a pre-release.
        """
        prereleases = [candidate for candidate in self.all_releases if candidate["prerelease"]]
        if prereleases:
            # GitHub returns releases in reverse chronological order, so first is latest
            return prereleases[0]
        raise ValueError("No pre-releases found")
    def get_release(self, version: str) -> Release:
        if version == "latest":
            return self.latest_release
        elif version == "prerelease":
            return self.latest_prerelease
        else:
            for release in self.all_releases:
                if release["tag_name"] in [version, f"v{version}"]:
@ -164,6 +209,11 @@ class FrontendManager:
    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")
    @classmethod
    def get_required_frontend_version(cls) -> str:
        """Get the required frontend package version.

        Thin wrapper that returns the result of the module-level
        ``get_required_frontend_version()`` unchanged.

        NOTE(review): the module-level helper returns None on its error
        paths, so the ``-> str`` annotation may be narrower than what
        callers can actually receive.
        """
        return get_required_frontend_version()
    @classmethod
    def default_frontend_path(cls) -> str:
        """
@ -253,7 +303,7 @@ comfyui-workflow-templates is not installed.
        Raises:
            argparse.ArgumentTypeError: If the version string is invalid.
        """
-        VERSION_PATTERN = r"^([a-zA-Z0-9][a-zA-Z0-9-]{0,38})/([a-zA-Z0-9_.-]+)@(v?\d+\.\d+\.\d+|latest)$"
+        VERSION_PATTERN = r"^([a-zA-Z0-9][a-zA-Z0-9-]{0,38})/([a-zA-Z0-9_.-]+)@(v?\d+\.\d+\.\d+[-._a-zA-Z0-9]*|latest|prerelease)$"
        match_result = re.match(VERSION_PATTERN, value)
        if match_result is None:
            raise argparse.ArgumentTypeError(f"Invalid version string: {value}")
--- a/app/model_manager.py
+++ b/app/model_manager.py
@ -130,10 +130,21 @@ class ModelFileManager:
            for file_name in filenames:
                try:
-                    relative_path = os.path.relpath(os.path.join(dirpath, file_name), directory)
+                    full_path = os.path.join(dirpath, file_name)
-                    result.append(relative_path)
+                    relative_path = os.path.relpath(full_path, directory)
-                except:
+
-                    logging.warning(f"Warning: Unable to access {file_name}. Skipping this file.")
+                    # Get file metadata
                    file_info = {
                        "name": relative_path,
                        "pathIndex": pathIndex,
                        "modified": os.path.getmtime(full_path),  # Add modification time
                        "created": os.path.getctime(full_path),   # Add creation time
                        "size": os.path.getsize(full_path)        # Add file size
                    }
                    result.append(file_info)
                except Exception as e:
                    logging.warning(f"Warning: Unable to access {file_name}. Error: {e}. Skipping this file.")
                    continue
            for d in subdirs:
@ -144,7 +155,7 @@ class ModelFileManager:
                    logging.warning(f"Warning: Unable to access {path}. Skipping this path.")
                    continue
-        return [{"name": f, "pathIndex": pathIndex} for f in result], dirs, time.perf_counter()
+        return result, dirs, time.perf_counter()
    def get_model_previews(self, filepath: str) -> list[str | BytesIO]:
        dirname = os.path.dirname(filepath)
--- a/app/user_manager.py
+++ b/app/user_manager.py
@ -20,13 +20,15 @@ class FileInfo(TypedDict):
    path: str
    size: int
    modified: int
    created: int
 def get_file_info(path: str, relative_to: str) -> FileInfo:
    return {
        "path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
        "size": os.path.getsize(path),
-        "modified": os.path.getmtime(path)
+        "modified": os.path.getmtime(path),
        "created": os.path.getctime(path)
    }
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -49,7 +49,8 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
 parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
+parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.")
 parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
 cm_group = parser.add_mutually_exclusive_group()
 cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
 cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.")
@ -144,6 +145,7 @@ class PerformanceFeature(enum.Enum):
 # --fast takes zero or more values (nargs="*"); each value is converted with PerformanceFeature.
 parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list of specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
 parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
 parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")
 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
@ -151,6 +153,7 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win
 parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
 parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
 parser.add_argument("--whitelist-custom-nodes", type=str, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.")
 parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")
 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@ -37,6 +37,8 @@ class IO(StrEnum):
    CONTROL_NET = "CONTROL_NET"
    VAE = "VAE"
    MODEL = "MODEL"
    LORA_MODEL = "LORA_MODEL"
    LOSS_MAP = "LOSS_MAP"
    CLIP_VISION = "CLIP_VISION"
    CLIP_VISION_OUTPUT = "CLIP_VISION_OUTPUT"
    STYLE_MODEL = "STYLE_MODEL"
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@ -43,7 +43,6 @@ if TYPE_CHECKING:
 def broadcast_image_to(tensor, target_batch_size, batched_number):
    current_batch_size = tensor.shape[0]
    #print(current_batch_size, target_batch_size)
    if current_batch_size == 1:
        return tensor
@ -390,7 +389,8 @@ class ControlLora(ControlNet):
                pass
        for k in self.control_weights:
-            if k not in {"lora_controlnet"}:
+            if (k not in {"lora_controlnet"}):
                if (k.endswith(".up") or k.endswith(".down") or k.endswith(".weight") or k.endswith(".bias")) and ("__" not in k):
                    comfy.utils.set_attr_param(self.control_model, k, self.control_weights[k].to(dtype).to(comfy.model_management.get_torch_device()))
    def copy(self):
--- a/comfy/gligen.py
+++ b/comfy/gligen.py
@ -1,55 +1,10 @@
 import math
 import torch
 from torch import nn
-from .ldm.modules.attention import CrossAttention
+from .ldm.modules.attention import CrossAttention, FeedForward
 from inspect import isfunction
 import comfy.ops
 ops = comfy.ops.manual_cast
 def exists(val):
    """Report whether ``val`` holds a value, i.e. is anything other than ``None``."""
    if val is None:
        return False
    return True
 def uniq(arr):
    """Return a dict keys view of the distinct elements of ``arr``, in first-seen order."""
    # A dict keeps one entry per distinct key and preserves insertion order.
    seen = dict.fromkeys(arr, True)
    return seen.keys()
 def default(val, d):
    """Return ``val`` when ``exists(val)`` holds, otherwise the fallback ``d``.

    If ``d`` is a plain Python function it is called with no arguments and its result is used as the fallback.
    """
    if not exists(val):
        return d() if isfunction(d) else d
    return val
 # feedforward
 class GEGLU(nn.Module):
    """Gated linear unit with a GELU gate.

    A single linear layer produces ``2 * dim_out`` features; the first half is the value and the second half, passed through GELU, is the gate.
    """
    def __init__(self, dim_in, dim_out):
        super().__init__()
        # One projection yields both the value and the gate halves.
        self.proj = ops.Linear(dim_in, dim_out * 2)
    def forward(self, x):
        # Split the projected features in two along the last dimension.
        value, gate = torch.chunk(self.proj(x), 2, dim=-1)
        return value * torch.nn.functional.gelu(gate)
 class FeedForward(nn.Module):
    """Feed-forward block: input projection, dropout, then an output projection.

    The hidden width is ``int(dim * mult)``; ``dim_out`` falls back to ``dim`` when not given.
    """
    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        hidden_dim = int(dim * mult)
        out_dim = default(dim_out, dim)
        # Input stage: gated GEGLU when requested, otherwise Linear followed by GELU.
        if glu:
            input_stage = GEGLU(dim, hidden_dim)
        else:
            input_stage = nn.Sequential(ops.Linear(dim, hidden_dim), nn.GELU())
        layers = [input_stage, nn.Dropout(dropout), ops.Linear(hidden_dim, out_dim)]
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x)
 class GatedCrossAttentionDense(nn.Module):
    def __init__(self, query_dim, context_dim, n_heads, d_head):
--- a/comfy/k_diffusion/sa_solver.py
+++ b/comfy/k_diffusion/sa_solver.py
@ -0,0 +1,121 @@
 # SA-Solver: Stochastic Adams Solver (NeurIPS 2023, arXiv:2309.05019)
 # Conference: https://proceedings.neurips.cc/paper_files/paper/2023/file/f4a6806490d31216a3ba667eb240c897-Paper-Conference.pdf
 # Codebase ref: https://github.com/scxue/SA-Solver
 import math
 from typing import Union, Callable
 import torch
 def compute_exponential_coeffs(s: torch.Tensor, t: torch.Tensor, solver_order: int, tau_t: float) -> torch.Tensor:
    """Evaluate the exponentially weighted monomial integrals used by SA-Solver in data prediction mode.

    For each power p in 0..solver_order-1 this computes
        (1 + tau^2) * integral from s to t of exp((1 + tau^2) * x) * x^p dx
    with the common factor exp((1 + tau^2) * t) divided out.

    Integration by parts gives the recursion
        I[p] = B[p] - (p / (1 + tau^2)) * I[p-1],   I[0] = B[0],
    where B[p] = x^p * exp((1 + tau^2) * x) / (1 + tau^2) evaluated between s and t.
    Unrolling the recursion turns it into a lower-triangular matrix applied to the vector of boundary terms.

    Args:
        s: Start time s.
        t: End time t.
        solver_order: Current order of the solver (number of powers to evaluate).
        tau_t: Stochastic strength parameter in the SDE.
    Returns:
        Coefficients ordered from p=0 to p=solver_order-1, shape (solver_order,).
    """
    tau_mul = 1 + tau_t ** 2
    step = t - s
    powers = torch.arange(solver_order, dtype=s.dtype, device=s.device)
    # Boundary terms with exp((1 + tau^2) * t) divided out; the outer (1 + tau^2) cancels the 1 / (1 + tau^2) in B[p].
    boundary_terms = t ** powers - s ** powers * (-tau_mul * step).exp()
    # Entry (row, col) holds row - col, i.e. how many recursion steps separate the two powers.
    depth = powers[:, None] - powers[None, :]
    # Coefficient magnitudes are built in log space: row! / col!, divided by (1 + tau^2)^depth.
    log_fact = (powers + 1).lgamma()
    log_coeffs = log_fact[:, None] - log_fact[None, :]
    if tau_t > 0:
        log_coeffs = log_coeffs - (depth * math.log(tau_mul))
    # Each recursion step flips the sign.
    parity = torch.where(depth % 2 == 0, 1.0, -1.0)
    # Only entries on or below the diagonal take part in the recursion.
    coeff_mat = (log_coeffs.exp() * parity).tril()
    return coeff_mat @ boundary_terms
 def compute_simple_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, is_corrector_step: bool = False) -> torch.Tensor:
    """Compute the simplified order-2 b coefficients (SA-Solver paper, Appendix D. Implementation Details).

    Returns the two coefficients stacked as ``[b_2, b_1]``.
    """
    tau_mul = 1 + tau_t ** 2
    h = lambda_t - lambda_s
    alpha_t = sigma_next * lambda_t.exp()
    # Sum of both coefficients: alpha_t * (1 - exp(-(1 + tau^2) * h)).
    total = alpha_t * (-h * tau_mul).expm1().neg()
    if not is_corrector_step:
        # Simplified 2-step predictor
        b_2 = alpha_t * (0.5 * tau_mul * h ** 2) / (curr_lambdas[-2] - lambda_s)
        b_1 = total - b_2
    else:
        # Simplified 1-step (order-2) corrector
        b_1 = alpha_t * (0.5 * tau_mul * h)
        b_2 = total - b_1
    return torch.stack([b_2, b_1])
 def compute_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, simple_order_2: bool = False, is_corrector_step: bool = False) -> torch.Tensor:
    """Compute the b_i coefficients of the SA-Solver (see eqs. 15 and 18).

    The solver order equals the number of lambdas (half-logSNR points) passed in.

    Args:
        sigma_next: Sigma at end time t.
        curr_lambdas: Lambda time points used to construct the Lagrange basis, shape (N,).
        lambda_s: Lambda at start time s.
        lambda_t: Lambda at end time t.
        tau_t: Stochastic strength parameter in the SDE.
        simple_order_2: Whether to enable the simple order-2 scheme.
        is_corrector_step: Flag for corrector step in simple order-2 mode.
    Returns:
        b_i coefficients for the SA-Solver, shape (N,), where N is the solver order.
    """
    order = curr_lambdas.shape[0]
    if simple_order_2 and order == 2:
        # The simplified scheme only applies at exactly two points.
        return compute_simple_stochastic_adams_b_coeffs(sigma_next, curr_lambdas, lambda_s, lambda_t, tau_t, is_corrector_step)
    # Integrals of the monomials 1, x, ..., x^(N-1) against the exponential weight.
    monomial_integrals = compute_exponential_coeffs(lambda_s, lambda_t, order, tau_t)
    # Solving against the transposed Vandermonde matrix converts monomial integrals into Lagrange-basis integrals.
    vander_T = torch.vander(curr_lambdas, order, increasing=True).T
    basis_integrals = torch.linalg.solve(vander_T, monomial_integrals)
    # (sigma_t * exp(-tau^2 * lambda_t)) * exp((1 + tau^2) * lambda_t) = sigma_t * exp(lambda_t) = alpha_t;
    # the exp((1 + tau^2) * lambda_t) factor was pulled out of the integral.
    alpha_t = sigma_next * lambda_t.exp()
    return alpha_t * basis_integrals
 def get_tau_interval_func(start_sigma: float, end_sigma: float, eta: float = 1.0) -> Callable[[Union[torch.Tensor, float]], float]:
    """Build the function that controls how stochastic SA-Solver is at a given sigma.

    The returned function yields ``eta`` for sigmas inside ``[end_sigma, start_sigma]`` and 0.0 elsewhere.
    With eta <= 0 it always yields 0.0, so SA-Solver runs as an ODE solver.
    The official approach selects the SDE interval by time t; here sigma is used instead.
    See:
        https://github.com/scxue/SA-Solver/blob/main/README.md
    """
    def tau_func(sigma: Union[torch.Tensor, float]) -> float:
        if eta <= 0:
            return 0.0  # ODE
        sigma_value = sigma.item() if isinstance(sigma, torch.Tensor) else sigma
        inside_interval = start_sigma >= sigma_value and sigma_value >= end_sigma
        if inside_interval:
            return eta
        return 0.0
    return tau_func
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@ -1,4 +1,5 @@
 import math
 from functools import partial
 from scipy import integrate
 import torch
@ -8,6 +9,7 @@ from tqdm.auto import trange, tqdm
 from . import utils
 from . import deis
 from . import sa_solver
 import comfy.model_patcher
 import comfy.model_sampling
@ -142,6 +144,33 @@ class BrownianTreeNoiseSampler:
        return self.tree(t0, t1) / (t1 - t0).abs().sqrt()
 def sigma_to_half_log_snr(sigma, model_sampling):
    """Map sigma to the half-logSNR log(alpha_t / sigma_t)."""
    if not isinstance(model_sampling, comfy.model_sampling.CONST):
        return sigma.log().neg()
    # CONST model sampling: log((1 - t) / t) = log((1 - sigma) / sigma) = -logit(sigma)
    logit_sigma = sigma.logit()
    return logit_sigma.neg()
 def half_log_snr_to_sigma(half_log_snr, model_sampling):
    """Map the half-logSNR log(alpha_t / sigma_t) back to sigma."""
    if not isinstance(model_sampling, comfy.model_sampling.CONST):
        return half_log_snr.neg().exp()
    # CONST model sampling: sigma = 1 / (1 + exp(half_log_snr)) = sigmoid(-half_log_snr)
    negated = half_log_snr.neg()
    return negated.sigmoid()
 def offset_first_sigma_for_snr(sigmas, model_sampling, percent_offset=1e-4):
    """Replace the first sigma when it would give an invalid logSNR.

    Only applies to schedules with more than one entry under CONST model sampling whose first sigma is >= 1.
    In that case a copy is returned with ``sigmas[0]`` set to ``model_sampling.percent_to_sigma(percent_offset)``;
    otherwise the input is returned unchanged.
    """
    needs_offset = (
        len(sigmas) > 1
        and isinstance(model_sampling, comfy.model_sampling.CONST)
        and sigmas[0] >= 1
    )
    if needs_offset:
        # Clone first so the caller's schedule is not modified in place.
        sigmas = sigmas.clone()
        sigmas[0] = model_sampling.percent_to_sigma(percent_offset)
    return sigmas
@torch.no_grad()
 def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    """Implements Algorithm 2 (Euler steps) from Karras et al. (2022)."""
@ -384,6 +413,10 @@ def sample_lms(model, x, sigmas, extra_args=None, callback=None, disable=None, o
            ds.pop(0)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
            # Denoising step
            x = denoised
        else:
            cur_order = min(i + 1, order)
            coeffs = [linear_multistep_coeff(cur_order, sigmas_cpu, i, j) for j in range(cur_order)]
            x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds)))
@ -682,6 +715,7 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
        # logged_x = torch.cat((logged_x, x.unsqueeze(0)), dim=0)
    return x
@torch.no_grad()
 def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """DPM-Solver++ (stochastic)."""
@ -693,38 +727,49 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
    seed = extra_args.get("seed", None)
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
-    sigma_fn = lambda t: t.neg().exp()
+
-    t_fn = lambda sigma: sigma.log().neg()
+    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
-            # Euler method
+            # Denoising step
-            d = to_d(x, sigmas[i], denoised)
+            x = denoised
            dt = sigmas[i + 1] - sigmas[i]
            x = x + d * dt
        else:
            # DPM-Solver++
-            t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
+            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-            h = t_next - t
+            h = lambda_t - lambda_s
-            s = t + h * r
+            lambda_s_1 = lambda_s + r * h
            fac = 1 / (2 * r)
            sigma_s_1 = sigma_fn(lambda_s_1)
            alpha_s = sigmas[i] * lambda_s.exp()
            alpha_s_1 = sigma_s_1 * lambda_s_1.exp()
            alpha_t = sigmas[i + 1] * lambda_t.exp()
            # Step 1
-            sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(s), eta)
+            sd, su = get_ancestral_step(lambda_s.neg().exp(), lambda_s_1.neg().exp(), eta)
-            s_ = t_fn(sd)
+            lambda_s_1_ = sd.log().neg()
-            x_2 = (sigma_fn(s_) / sigma_fn(t)) * x - (t - s_).expm1() * denoised
+            h_ = lambda_s_1_ - lambda_s
-            x_2 = x_2 + noise_sampler(sigma_fn(t), sigma_fn(s)) * s_noise * su
+            x_2 = (alpha_s_1 / alpha_s) * (-h_).exp() * x - alpha_s_1 * (-h_).expm1() * denoised
-            denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)
+            if eta > 0 and s_noise > 0:
                x_2 = x_2 + alpha_s_1 * noise_sampler(sigmas[i], sigma_s_1) * s_noise * su
            denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
            # Step 2
-            sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(t_next), eta)
+            sd, su = get_ancestral_step(lambda_s.neg().exp(), lambda_t.neg().exp(), eta)
-            t_next_ = t_fn(sd)
+            lambda_t_ = sd.log().neg()
            h_ = lambda_t_ - lambda_s
            denoised_d = (1 - fac) * denoised + fac * denoised_2
-            x = (sigma_fn(t_next_) / sigma_fn(t)) * x - (t - t_next_).expm1() * denoised_d
+            x = (alpha_t / alpha_s) * (-h_).exp() * x - alpha_t * (-h_).expm1() * denoised_d
-            x = x + noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * su
+            if eta > 0 and s_noise > 0:
                x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * su
    return x
@ -753,6 +798,7 @@ def sample_dpmpp_2m(model, x, sigmas, extra_args=None, callback=None, disable=No
        old_denoised = denoised
    return x
@torch.no_grad()
 def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    """DPM-Solver++(2M) SDE."""
@ -768,9 +814,12 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    old_denoised = None
-    h_last = None
+    h, h_last = None, None
    h = None
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -781,26 +830,29 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
            x = denoised
        else:
            # DPM-Solver++(2M) SDE
-            t, s = -sigmas[i].log(), -sigmas[i + 1].log()
+            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-            h = s - t
+            h = lambda_t - lambda_s
-            eta_h = eta * h
+            h_eta = h * (eta + 1)
-            x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + (-h - eta_h).expm1().neg() * denoised
+            alpha_t = sigmas[i + 1] * lambda_t.exp()
            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x + alpha_t * (-h_eta).expm1().neg() * denoised
            if old_denoised is not None:
                r = h_last / h
                if solver_type == 'heun':
-                    x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * (1 / r) * (denoised - old_denoised)
+                    x = x + alpha_t * ((-h_eta).expm1().neg() / (-h_eta) + 1) * (1 / r) * (denoised - old_denoised)
                elif solver_type == 'midpoint':
-                    x = x + 0.5 * (-h - eta_h).expm1().neg() * (1 / r) * (denoised - old_denoised)
+                    x = x + 0.5 * alpha_t * (-h_eta).expm1().neg() * (1 / r) * (denoised - old_denoised)
-            if eta:
+            if eta > 0 and s_noise > 0:
-                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * eta_h).expm1().neg().sqrt() * s_noise
+                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * h * eta).expm1().neg().sqrt() * s_noise
        old_denoised = denoised
        h_last = h
    return x
@torch.no_grad()
 def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """DPM-Solver++(3M) SDE."""
@ -814,6 +866,10 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    denoised_1, denoised_2 = None, None
    h, h_1, h_2 = None, None, None
@ -825,13 +881,16 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
            # Denoising step
            x = denoised
        else:
-            t, s = -sigmas[i].log(), -sigmas[i + 1].log()
+            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-            h = s - t
+            h = lambda_t - lambda_s
            h_eta = h * (eta + 1)
-            x = torch.exp(-h_eta) * x + (-h_eta).expm1().neg() * denoised
+            alpha_t = sigmas[i + 1] * lambda_t.exp()
            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x + alpha_t * (-h_eta).expm1().neg() * denoised
            if h_2 is not None:
                # DPM-Solver++(3M) SDE
                r0 = h_1 / h
                r1 = h_2 / h
                d1_0 = (denoised - denoised_1) / r0
@ -840,20 +899,22 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
                d2 = (d1_0 - d1_1) / (r0 + r1)
                phi_2 = h_eta.neg().expm1() / h_eta + 1
                phi_3 = phi_2 / h_eta - 0.5
-                x = x + phi_2 * d1 - phi_3 * d2
+                x = x + (alpha_t * phi_2) * d1 - (alpha_t * phi_3) * d2
            elif h_1 is not None:
                # DPM-Solver++(2M) SDE
                r = h_1 / h
                d = (denoised - denoised_1) / r
                phi_2 = h_eta.neg().expm1() / h_eta + 1
-                x = x + phi_2 * d
+                x = x + (alpha_t * phi_2) * d
-            if eta:
+            if eta > 0 and s_noise > 0:
                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * h * eta).expm1().neg().sqrt() * s_noise
        denoised_1, denoised_2 = denoised, denoised_1
        h_1, h_2 = h, h_1
    return x
@torch.no_grad()
 def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    if len(sigmas) <= 1:
@ -863,6 +924,7 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
@torch.no_grad()
 def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    if len(sigmas) <= 1:
@ -872,6 +934,7 @@ def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
@torch.no_grad()
 def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    if len(sigmas) <= 1:
@ -1009,7 +1072,9 @@ def sample_ipndm(model, x, sigmas, extra_args=None, callback=None, disable=None,
        d_cur = (x_cur - denoised) / t_cur
        order = min(max_order, i+1)
-        if order == 1:      # First Euler step.
+        if t_next == 0:     # Denoising step
            x_next = denoised
        elif order == 1:    # First Euler step.
            x_next = x_cur + (t_next - t_cur) * d_cur
        elif order == 2:    # Use one history point.
            x_next = x_cur + (t_next - t_cur) * (3 * d_cur - buffer_model[-1]) / 2
@ -1027,6 +1092,7 @@ def sample_ipndm(model, x, sigmas, extra_args=None, callback=None, disable=None,
    return x_next
 #From https://github.com/zju-pi/diff-sampler/blob/main/diff-solvers-main/solvers.py
 #under Apache 2 license
 def sample_ipndm_v(model, x, sigmas, extra_args=None, callback=None, disable=None, max_order=4):
@ -1050,7 +1116,9 @@ def sample_ipndm_v(model, x, sigmas, extra_args=None, callback=None, disable=Non
        d_cur = (x_cur - denoised) / t_cur
        order = min(max_order, i+1)
-        if order == 1:      # First Euler step.
+        if t_next == 0:     # Denoising step
            x_next = denoised
        elif order == 1:    # First Euler step.
            x_next = x_cur + (t_next - t_cur) * d_cur
        elif order == 2:    # Use one history point.
            h_n = (t_next - t_cur)
@ -1090,6 +1158,7 @@ def sample_ipndm_v(model, x, sigmas, extra_args=None, callback=None, disable=Non
    return x_next
 #From https://github.com/zju-pi/diff-sampler/blob/main/diff-solvers-main/solvers.py
 #under Apache 2 license
@torch.no_grad()
@ -1140,39 +1209,22 @@ def sample_deis(model, x, sigmas, extra_args=None, callback=None, disable=None,
    return x_next
@torch.no_grad()
 def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
    extra_args = {} if extra_args is None else extra_args
    temp = [0]
    def post_cfg_function(args):
        temp[0] = args["uncond_denoised"]
        return args["denoised"]
    model_options = extra_args.get("model_options", {}).copy()
    extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        sigma_hat = sigmas[i]
        denoised = model(x, sigma_hat * s_in, **extra_args)
        d = to_d(x, sigma_hat, temp[0])
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
        # Euler method
        x = denoised + d * sigmas[i + 1]
    return x
@torch.no_grad()
 def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    """Ancestral sampling with Euler method steps."""
+    """Ancestral sampling with Euler method steps (CFG++)."""
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    temp = [0]
+    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    uncond_denoised = None
    def post_cfg_function(args):
-        temp[0] = args["uncond_denoised"]
+        nonlocal uncond_denoised
        uncond_denoised = args["uncond_denoised"]
        return args["denoised"]
    model_options = extra_args.get("model_options", {}).copy()
@ -1181,15 +1233,33 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        d = to_d(x, sigmas[i], temp[0])
+        if sigmas[i + 1] == 0:
            # Denoising step
            x = denoised
        else:
            alpha_s = sigmas[i] * lambda_fn(sigmas[i]).exp()
            alpha_t = sigmas[i + 1] * lambda_fn(sigmas[i + 1]).exp()
            d = to_d(x, sigmas[i], alpha_s * uncond_denoised)   # to noise
            # DDIM stochastic sampling
            sigma_down, sigma_up = get_ancestral_step(sigmas[i] / alpha_s, sigmas[i + 1] / alpha_t, eta=eta)
            sigma_down = alpha_t * sigma_down
            # Euler method
-        x = denoised + d * sigma_down
+            x = alpha_t * denoised + sigma_down * d
-        if sigmas[i + 1] > 0:
+            if eta > 0 and s_noise > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+                x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x
@torch.no_grad()
 def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
    """Euler method steps (CFG++).

    Thin wrapper: delegates to ``sample_euler_ancestral_cfg_pp`` with ``eta=0.0``, ``s_noise=0.0`` and
    ``noise_sampler=None``, forwarding ``extra_args``, ``callback`` and ``disable`` unchanged.
    """
    return sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None)
@torch.no_grad()
 def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
@ -1346,6 +1416,7 @@ def sample_res_multistep_ancestral(model, x, sigmas, extra_args=None, callback=N
 def sample_res_multistep_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Thin wrapper: call ``res_multistep`` with ``cfg_pp=True``, forwarding ``eta``, ``s_noise``, ``noise_sampler`` and the remaining arguments unchanged."""
    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True)
@torch.no_grad()
 def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2., cfg_pp=False):
    """Gradient-estimation sampler. Paper: https://openreview.net/pdf?id=o2ND9v0CeK"""
@ -1372,31 +1443,32 @@ def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None,
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        dt = sigmas[i + 1] - sigmas[i]
-        if i == 0:
+        if sigmas[i + 1] == 0:
            # Denoising step
            x = denoised
        else:
            # Euler method
            if cfg_pp:
                x = denoised + d * sigmas[i + 1]
            else:
                x = x + d * dt
-        else:
+
            if i >= 1:
                # Gradient estimation
            if cfg_pp:
                d_bar = (ge_gamma - 1) * (d - old_d)
                x = denoised + d * sigmas[i + 1] + d_bar * dt
            else:
                d_bar = ge_gamma * d + (1 - ge_gamma) * old_d
                x = x + d_bar * dt
        old_d = d
    return x
@torch.no_grad()
 def sample_gradient_estimation_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
    """Thin wrapper: call ``sample_gradient_estimation`` with ``cfg_pp=True``, forwarding ``ge_gamma`` and the remaining arguments unchanged."""
    return sample_gradient_estimation(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, ge_gamma=ge_gamma, cfg_pp=True)
@torch.no_grad()
-def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, noise_scaler=None, max_stage=3):
+def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1.0, noise_sampler=None, noise_scaler=None, max_stage=3):
-    """
+    """Extended Reverse-Time SDE solver (VP ER-SDE-Solver-3). arXiv: https://arxiv.org/abs/2309.06169.
    Extended Reverse-Time SDE solver (VE ER-SDE-Solver-3). Arxiv: https://arxiv.org/abs/2309.06169.
    Code reference: https://github.com/QinpengCui/ER-SDE-Solver/blob/main/er_sde_solver.py.
    """
    extra_args = {} if extra_args is None else extra_args
@ -1404,12 +1476,18 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
-    def default_noise_scaler(sigma):
+    def default_er_sde_noise_scaler(x):
-        return sigma * ((sigma ** 0.3).exp() + 10.0)
+        return x * ((x ** 0.3).exp() + 10.0)
-    noise_scaler = default_noise_scaler if noise_scaler is None else noise_scaler
+
    noise_scaler = default_er_sde_noise_scaler if noise_scaler is None else noise_scaler
    num_integration_points = 200.0
    point_indice = torch.arange(0, num_integration_points, dtype=torch.float32, device=x.device)
    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    half_log_snrs = sigma_to_half_log_snr(sigmas, model_sampling)
    er_lambdas = half_log_snrs.neg().exp()  # er_lambda_t = sigma_t / alpha_t
    old_denoised = None
    old_denoised_d = None
@ -1420,41 +1498,45 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
        stage_used = min(max_stage, i + 1)
        if sigmas[i + 1] == 0:
            x = denoised
        elif stage_used == 1:
            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
            x = r * x + (1 - r) * denoised
        else:
-            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
+            er_lambda_s, er_lambda_t = er_lambdas[i], er_lambdas[i + 1]
-            x = r * x + (1 - r) * denoised
+            alpha_s = sigmas[i] / er_lambda_s
            alpha_t = sigmas[i + 1] / er_lambda_t
            r_alpha = alpha_t / alpha_s
            r = noise_scaler(er_lambda_t) / noise_scaler(er_lambda_s)
-            dt = sigmas[i + 1] - sigmas[i]
+            # Stage 1 Euler
-            sigma_step_size = -dt / num_integration_points
+            x = r_alpha * r * x + alpha_t * (1 - r) * denoised
-            sigma_pos = sigmas[i + 1] + point_indice * sigma_step_size
+
-            scaled_pos = noise_scaler(sigma_pos)
+            if stage_used >= 2:
                dt = er_lambda_t - er_lambda_s
                lambda_step_size = -dt / num_integration_points
                lambda_pos = er_lambda_t + point_indice * lambda_step_size
                scaled_pos = noise_scaler(lambda_pos)
                # Stage 2
-            s = torch.sum(1 / scaled_pos) * sigma_step_size
+                s = torch.sum(1 / scaled_pos) * lambda_step_size
-            denoised_d = (denoised - old_denoised) / (sigmas[i] - sigmas[i - 1])
+                denoised_d = (denoised - old_denoised) / (er_lambda_s - er_lambdas[i - 1])
-            x = x + (dt + s * noise_scaler(sigmas[i + 1])) * denoised_d
+                x = x + alpha_t * (dt + s * noise_scaler(er_lambda_t)) * denoised_d
                if stage_used >= 3:
                    # Stage 3
-                s_u = torch.sum((sigma_pos - sigmas[i]) / scaled_pos) * sigma_step_size
+                    s_u = torch.sum((lambda_pos - er_lambda_s) / scaled_pos) * lambda_step_size
-                denoised_u = (denoised_d - old_denoised_d) / ((sigmas[i] - sigmas[i - 2]) / 2)
+                    denoised_u = (denoised_d - old_denoised_d) / ((er_lambda_s - er_lambdas[i - 2]) / 2)
-                x = x + ((dt ** 2) / 2 + s_u * noise_scaler(sigmas[i + 1])) * denoised_u
+                    x = x + alpha_t * ((dt ** 2) / 2 + s_u * noise_scaler(er_lambda_t)) * denoised_u
                old_denoised_d = denoised_d
-        if s_noise != 0 and sigmas[i + 1] > 0:
+            if s_noise > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (sigmas[i + 1] ** 2 - sigmas[i] ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
+                x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (er_lambda_t ** 2 - er_lambda_s ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
        old_denoised = denoised
    return x
@torch.no_grad()
 def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
-    '''
+    """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2.
-    SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 2
+    arXiv: https://arxiv.org/abs/2305.14267
-    Arxiv: https://arxiv.org/abs/2305.14267
+    """
    '''
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
@ -1462,6 +1544,11 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
    inject_noise = eta > 0 and s_noise > 0
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
@ -1469,80 +1556,206 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
        if sigmas[i + 1] == 0:
            x = denoised
        else:
-            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
+            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-            h = t_next - t
+            h = lambda_t - lambda_s
            h_eta = h * (eta + 1)
-            s = t + r * h
+            lambda_s_1 = lambda_s + r * h
            fac = 1 / (2 * r)
-            sigma_s = s.neg().exp()
+            sigma_s_1 = sigma_fn(lambda_s_1)
            # alpha_t = sigma_t * exp(log(alpha_t / sigma_t)) = sigma_t * exp(lambda_t)
            alpha_s_1 = sigma_s_1 * lambda_s_1.exp()
            alpha_t = sigmas[i + 1] * lambda_t.exp()
            coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1()
            if inject_noise:
                # 0 < r < 1
                noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt()
-                noise_coeff_2 = ((-2 * r * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
+                noise_coeff_2 = (-r * h * eta).exp() * (-2 * (1 - r) * h * eta).expm1().neg().sqrt()
-                noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s), noise_sampler(sigma_s, sigmas[i + 1])
+                noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigmas[i + 1])
            # Step 1
-            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
+            x_2 = sigma_s_1 / sigmas[i] * (-r * h * eta).exp() * x - alpha_s_1 * coeff_1 * denoised
            if inject_noise:
                x_2 = x_2 + sigma_s * (noise_coeff_1 * noise_1) * s_noise
            denoised_2 = model(x_2, sigma_s * s_in, **extra_args)
            # Step 2
            denoised_d = (1 - fac) * denoised + fac * denoised_2
            x = (coeff_2 + 1) * x - coeff_2 * denoised_d
            if inject_noise:
                x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
    return x
@torch.no_grad()
 def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
    '''
    SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 3
    Arxiv: https://arxiv.org/abs/2305.14267
    '''
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    inject_noise = eta > 0 and s_noise > 0
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
            x = denoised
        else:
            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
            h = t_next - t
            h_eta = h * (eta + 1)
            s_1 = t + r_1 * h
            s_2 = t + r_2 * h
            sigma_s_1, sigma_s_2 = s_1.neg().exp(), s_2.neg().exp()
            coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1()
            if inject_noise:
                noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt()
                noise_coeff_2 = ((-2 * r_1 * h * eta).expm1() - (-2 * r_2 * h * eta).expm1()).sqrt()
                noise_coeff_3 = ((-2 * r_2 * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
                noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1])
            # Step 1
            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
            if inject_noise:
                x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise
            denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
            # Step 2
-            x_3 = (coeff_2 + 1) * x - coeff_2 * denoised + (r_2 / r_1) * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised)
+            denoised_d = (1 - fac) * denoised + fac * denoised_2
            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * coeff_2 * denoised_d
            if inject_noise:
                x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
    return x
@torch.no_grad()
 def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
    """SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 3.
    arXiv: https://arxiv.org/abs/2305.14267
    """
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    inject_noise = eta > 0 and s_noise > 0
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
            x = denoised
        else:
            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
            h = lambda_t - lambda_s
            h_eta = h * (eta + 1)
            lambda_s_1 = lambda_s + r_1 * h
            lambda_s_2 = lambda_s + r_2 * h
            sigma_s_1, sigma_s_2 = sigma_fn(lambda_s_1), sigma_fn(lambda_s_2)
            # alpha_t = sigma_t * exp(log(alpha_t / sigma_t)) = sigma_t * exp(lambda_t)
            alpha_s_1 = sigma_s_1 * lambda_s_1.exp()
            alpha_s_2 = sigma_s_2 * lambda_s_2.exp()
            alpha_t = sigmas[i + 1] * lambda_t.exp()
            coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1()
            if inject_noise:
                # 0 < r_1 < r_2 < 1
                noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt()
                noise_coeff_2 = (-r_1 * h * eta).exp() * (-2 * (r_2 - r_1) * h * eta).expm1().neg().sqrt()
                noise_coeff_3 = (-r_2 * h * eta).exp() * (-2 * (1 - r_2) * h * eta).expm1().neg().sqrt()
                noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1])
            # Step 1
            x_2 = sigma_s_1 / sigmas[i] * (-r_1 * h * eta).exp() * x - alpha_s_1 * coeff_1 * denoised
            if inject_noise:
                x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise
            denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
            # Step 2
            x_3 = sigma_s_2 / sigmas[i] * (-r_2 * h * eta).exp() * x - alpha_s_2 * coeff_2 * denoised + (r_2 / r_1) * alpha_s_2 * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised)
            if inject_noise:
                x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
            denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args)
            # Step 3
-            x = (coeff_3 + 1) * x - coeff_3 * denoised + (1. / r_2) * (coeff_3 / h_eta + 1) * (denoised_3 - denoised)
+            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * coeff_3 * denoised + (1. / r_2) * alpha_t * (coeff_3 / h_eta + 1) * (denoised_3 - denoised)
            if inject_noise:
                x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise
    return x
@torch.no_grad()
 def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, use_pece=False, simple_order_2=False):
    """Stochastic Adams Solver with predictor-corrector method (NeurIPS 2023).

    Each iteration evaluates the model at the predicted state, optionally
    corrects the current state with a multistep corrector, then predicts the
    next state with a multistep predictor.

    Args:
        model: Denoiser called as ``model(x, sigma * s_in, **extra_args)``.
        x: Initial state; returned unchanged when ``sigmas`` has at most one entry.
        sigmas: Noise schedule; one step is taken per consecutive pair.
        extra_args: Extra keyword arguments for ``model``; its ``"seed"`` entry
            (if any) seeds the default noise sampler.
        callback: Optional callable invoked after every model evaluation of the
            predicted state with a dict of ``x``, ``i``, ``sigma``, ``sigma_hat``
            and ``denoised``.
        disable: Forwarded to ``trange`` as its ``disable`` argument.
        tau_func: Callable mapping a sigma to ``tau_t``. When None, a default is
            built with ``sa_solver.get_tau_interval_func`` between the sigmas at
            20% and 80% of the schedule with ``eta=1.0``.
        s_noise: Multiplier on the injected noise; noise is only added when both
            ``tau_t > 0`` and ``s_noise > 0``.
        noise_sampler: Callable ``(sigma, sigma_next) -> noise``; defaults to
            ``default_noise_sampler(x, seed=seed)``.
        predictor_order: Maximum number of past denoised outputs used by the predictor.
        corrector_order: Maximum number of past denoised outputs used by the corrector.
        use_pece: When True, the model is evaluated a second time at the
            corrected state and that result replaces the newest history entry.
        simple_order_2: Forwarded to ``sa_solver.compute_stochastic_adams_b_coeffs``.

    Returns:
        The final state ``x``.
    """
    if len(sigmas) <= 1:
        return x
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    # Half-log-SNR value for every sigma in the schedule, computed once up front
    lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)
    if tau_func is None:
        # Use default interval for stochastic sampling
        start_sigma = model_sampling.percent_to_sigma(0.2)
        end_sigma = model_sampling.percent_to_sigma(0.8)
        tau_func = sa_solver.get_tau_interval_func(start_sigma, end_sigma, eta=1.0)
    # History only needs to be as long as the larger of the two orders
    max_used_order = max(predictor_order, corrector_order)
    x_pred = x  # x: current state, x_pred: predicted next state
    # State carried from one iteration's predictor into the next iteration's corrector
    h = 0.0
    tau_t = 0.0
    noise = 0.0
    pred_list = []
    # Lower order near the end to improve stability
    lower_order_to_end = sigmas[-1].item() == 0
    for i in trange(len(sigmas) - 1, disable=disable):
        # Evaluation
        denoised = model(x_pred, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({"x": x_pred, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})
        pred_list.append(denoised)
        # Keep only the most recent max_used_order model outputs
        pred_list = pred_list[-max_used_order:]
        predictor_order_used = min(predictor_order, len(pred_list))
        # No corrector on the first step, nor on the final denoising step unless PECE is requested
        if i == 0 or (sigmas[i + 1] == 0 and not use_pece):
            corrector_order_used = 0
        else:
            corrector_order_used = min(corrector_order, len(pred_list))
        if lower_order_to_end:
            # Cap the orders by the number of remaining steps
            predictor_order_used = min(predictor_order_used, len(sigmas) - 2 - i)
            corrector_order_used = min(corrector_order_used, len(sigmas) - 1 - i)
        # Corrector
        if corrector_order_used == 0:
            # Update by the predicted state
            x = x_pred
        else:
            curr_lambdas = lambdas[i - corrector_order_used + 1:i + 1]
            # tau_t here is still the value chosen by the previous iteration's predictor
            b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs(
                sigmas[i],
                curr_lambdas,
                lambdas[i - 1],
                lambdas[i],
                tau_t,
                simple_order_2,
                is_corrector_step=True,
            )
            pred_mat = torch.stack(pred_list[-corrector_order_used:], dim=1)    # (B, K, ...)
            corr_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0]))  # (B, ...)
            # h is the lambda step stored by the previous predictor (lambdas[i] - lambdas[i - 1])
            x = sigmas[i] / sigmas[i - 1] * (-(tau_t ** 2) * h).exp() * x + corr_res
            if tau_t > 0 and s_noise > 0:
                # The noise from the previous predictor step
                x = x + noise
            if use_pece:
                # Evaluate the corrected state
                denoised = model(x, sigmas[i] * s_in, **extra_args)
                pred_list[-1] = denoised
        # Predictor
        if sigmas[i + 1] == 0:
            # Denoising step
            x = denoised
        else:
            tau_t = tau_func(sigmas[i + 1])
            curr_lambdas = lambdas[i - predictor_order_used + 1:i + 1]
            b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs(
                sigmas[i + 1],
                curr_lambdas,
                lambdas[i],
                lambdas[i + 1],
                tau_t,
                simple_order_2,
                is_corrector_step=False,
            )
            pred_mat = torch.stack(pred_list[-predictor_order_used:], dim=1)    # (B, K, ...)
            pred_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0]))  # (B, ...)
            h = lambdas[i + 1] - lambdas[i]
            x_pred = sigmas[i + 1] / sigmas[i] * (-(tau_t ** 2) * h).exp() * x + pred_res
            if tau_t > 0 and s_noise > 0:
                # Stored in `noise` so the next iteration's corrector can add the same sample
                noise = noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * tau_t ** 2 * h).expm1().neg().sqrt() * s_noise
                x_pred = x_pred + noise
    return x
@torch.no_grad()
 def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False):
    """Stochastic Adams Solver with PECE (Predict–Evaluate–Correct–Evaluate) mode (NeurIPS 2023)."""
    return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2)
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -457,6 +457,82 @@ class Wan21(LatentFormat):
        latents_std = self.latents_std.to(latent.device, latent.dtype)
        return latent * latents_std / self.scale_factor + latents_mean
 class Wan22(Wan21):
    """Latent format with 48 channels that overrides the tables inherited from Wan21.

    Only class-level tables and ``__init__`` are defined here; all methods come
    from ``Wan21``.
    """
    latent_channels = 48
    latent_dimensions = 3
    # 48 rows (one per latent channel) of 3 weights each.
    # NOTE(review): presumably the (R, G, B) projection used for latent previews — confirm against the consumers of LatentFormat.
    latent_rgb_factors = [
            [ 0.0119,  0.0103,  0.0046],
            [-0.1062, -0.0504,  0.0165],
            [ 0.0140,  0.0409,  0.0491],
            [-0.0813, -0.0677,  0.0607],
            [ 0.0656,  0.0851,  0.0808],
            [ 0.0264,  0.0463,  0.0912],
            [ 0.0295,  0.0326,  0.0590],
            [-0.0244, -0.0270,  0.0025],
            [ 0.0443, -0.0102,  0.0288],
            [-0.0465, -0.0090, -0.0205],
            [ 0.0359,  0.0236,  0.0082],
            [-0.0776,  0.0854,  0.1048],
            [ 0.0564,  0.0264,  0.0561],
            [ 0.0006,  0.0594,  0.0418],
            [-0.0319, -0.0542, -0.0637],
            [-0.0268,  0.0024,  0.0260],
            [ 0.0539,  0.0265,  0.0358],
            [-0.0359, -0.0312, -0.0287],
            [-0.0285, -0.1032, -0.1237],
            [ 0.1041,  0.0537,  0.0622],
            [-0.0086, -0.0374, -0.0051],
            [ 0.0390,  0.0670,  0.2863],
            [ 0.0069,  0.0144,  0.0082],
            [ 0.0006, -0.0167,  0.0079],
            [ 0.0313, -0.0574, -0.0232],
            [-0.1454, -0.0902, -0.0481],
            [ 0.0714,  0.0827,  0.0447],
            [-0.0304, -0.0574, -0.0196],
            [ 0.0401,  0.0384,  0.0204],
            [-0.0758, -0.0297, -0.0014],
            [ 0.0568,  0.1307,  0.1372],
            [-0.0055, -0.0310, -0.0380],
            [ 0.0239, -0.0305,  0.0325],
            [-0.0663, -0.0673, -0.0140],
            [-0.0416, -0.0047, -0.0023],
            [ 0.0166,  0.0112, -0.0093],
            [-0.0211,  0.0011,  0.0331],
            [ 0.1833,  0.1466,  0.2250],
            [-0.0368,  0.0370,  0.0295],
            [-0.3441, -0.3543, -0.2008],
            [-0.0479, -0.0489, -0.0420],
            [-0.0660, -0.0153,  0.0800],
            [-0.0101,  0.0068,  0.0156],
            [-0.0690, -0.0452, -0.0927],
            [-0.0145,  0.0041,  0.0015],
            [ 0.0421,  0.0451,  0.0373],
            [ 0.0504, -0.0483, -0.0356],
            [-0.0837,  0.0168,  0.0055]
        ]
    # Three values, matching the width of each latent_rgb_factors row
    latent_rgb_factors_bias = [0.0317, -0.0878, -0.1388]
    def __init__(self):
        # These three attributes are read by the inherited Wan21 code, which
        # computes `latent * latents_std / self.scale_factor + latents_mean`.
        self.scale_factor = 1.0
        # 48 per-channel means, reshaped to (1, latent_channels, 1, 1, 1) so they broadcast over a 5-D latent
        self.latents_mean = torch.tensor([
                -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
                -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
                -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
                -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
                -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
                0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667,
            ]).view(1, self.latent_channels, 1, 1, 1)
        # 48 per-channel standard deviations, same layout as latents_mean
        self.latents_std = torch.tensor([
                0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
                0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
                0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
                0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
                0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
                0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
            ]).view(1, self.latent_channels, 1, 1, 1)
 class Hunyuan3Dv2(LatentFormat):
    latent_channels = 64
    latent_dimensions = 1
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@ -254,13 +254,12 @@ class Chroma(nn.Module):
    def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs):
        bs, c, h, w = x.shape
-        patch_size = 2
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
-        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size)
-        h_len = ((h + (patch_size // 2)) // patch_size)
+        h_len = ((h + (self.patch_size // 2)) // self.patch_size)
-        w_len = ((w + (patch_size // 2)) // patch_size)
+        w_len = ((w + (self.patch_size // 2)) // self.patch_size)
        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
@ -268,4 +267,4 @@ class Chroma(nn.Module):
        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h,:w]
--- a/comfy/ldm/cosmos/blocks.py
+++ b/comfy/ldm/cosmos/blocks.py
@ -26,16 +26,6 @@ from torch import nn
 from comfy.ldm.modules.attention import optimized_attention
 def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """Combine the two halves of ``t``'s last axis using the coefficients in ``freqs``.

    The last axis of ``t`` is split into a first and a second half; element
    ``k`` of each half forms a pair, and the pair is mixed with
    ``freqs[..., 0]`` and ``freqs[..., 1]``. The arithmetic runs in float32
    and the result is cast back to ``t``'s dtype and shape.
    """
    # assumes freqs broadcasts against (..., D // 2, 2) and ends in a 2x2 block — TODO confirm with callers
    leading_dims = t.shape[:-1]
    halves = t.reshape(*leading_dims, 2, -1)
    # (..., 2, D/2) -> (..., D/2, 1, 2): the final axis selects first/second half
    pairs = halves.movedim(-2, -1).unsqueeze(-2).float()
    first_half, second_half = pairs[..., 0], pairs[..., 1]
    mixed = freqs[..., 0] * first_half + freqs[..., 1] * second_half
    return mixed.movedim(-1, -2).reshape(*t.shape).type_as(t)
 def get_normalization(name: str, channels: int, weight_args={}, operations=None):
    if name == "I":
        return nn.Identity()
--- a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
@ -58,7 +58,8 @@ def is_odd(n: int) -> bool:
 def nonlinearity(x):
-    return x * torch.sigmoid(x)
+    # x * sigmoid(x)
    return torch.nn.functional.silu(x)
 def Normalize(in_channels, num_groups=32):
--- a/comfy/ldm/cosmos/position_embedding.py
+++ b/comfy/ldm/cosmos/position_embedding.py
@ -66,15 +66,16 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
        h_extrapolation_ratio: float = 1.0,
        w_extrapolation_ratio: float = 1.0,
        t_extrapolation_ratio: float = 1.0,
        enable_fps_modulation: bool = True,
        device=None,
        **kwargs,  # used for compatibility with other positional embeddings; unused in this class
    ):
        del kwargs
        super().__init__()
        self.register_buffer("seq", torch.arange(max(len_h, len_w, len_t), dtype=torch.float, device=device))
        self.base_fps = base_fps
        self.max_h = len_h
        self.max_w = len_w
        self.enable_fps_modulation = enable_fps_modulation
        dim = head_dim
        dim_h = dim // 6 * 2
@ -132,21 +133,19 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
        temporal_freqs = 1.0 / (t_theta**self.dim_temporal_range.to(device=device))
        B, T, H, W, _ = B_T_H_W_C
        seq = torch.arange(max(H, W, T), dtype=torch.float, device=device)
        uniform_fps = (fps is None) or isinstance(fps, (int, float)) or (fps.min() == fps.max())
        assert (
            uniform_fps or B == 1 or T == 1
        ), "For video batch, batch size should be 1 for non-uniform fps. For image batch, T should be 1"
-        assert (
+        half_emb_h = torch.outer(seq[:H].to(device=device), h_spatial_freqs)
-            H <= self.max_h and W <= self.max_w
+        half_emb_w = torch.outer(seq[:W].to(device=device), w_spatial_freqs)
        ), f"Input dimensions (H={H}, W={W}) exceed the maximum dimensions (max_h={self.max_h}, max_w={self.max_w})"
        half_emb_h = torch.outer(self.seq[:H].to(device=device), h_spatial_freqs)
        half_emb_w = torch.outer(self.seq[:W].to(device=device), w_spatial_freqs)
        # apply sequence scaling in temporal dimension
-        if fps is None:  # image case
+        if fps is None or self.enable_fps_modulation is False:  # image case
-            half_emb_t = torch.outer(self.seq[:T].to(device=device), temporal_freqs)
+            half_emb_t = torch.outer(seq[:T].to(device=device), temporal_freqs)
        else:
-            half_emb_t = torch.outer(self.seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)
+            half_emb_t = torch.outer(seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)
        half_emb_h = torch.stack([torch.cos(half_emb_h), -torch.sin(half_emb_h), torch.sin(half_emb_h), torch.cos(half_emb_h)], dim=-1)
        half_emb_w = torch.stack([torch.cos(half_emb_w), -torch.sin(half_emb_w), torch.sin(half_emb_w), torch.cos(half_emb_w)], dim=-1)
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@ -0,0 +1,864 @@
 # original code from: https://github.com/nvidia-cosmos/cosmos-predict2
 import torch
 from torch import nn
 from einops import rearrange
 from einops.layers.torch import Rearrange
 import logging
 from typing import Callable, Optional, Tuple
 import math
 from .position_embedding import VideoRopePosition3DEmb, LearnablePosEmbAxis
 from torchvision import transforms
 from comfy.ldm.modules.attention import optimized_attention
 def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """Combine the two halves of ``t``'s last axis using the coefficients in ``freqs``.

    The last axis of ``t`` is split into a first and a second half; element
    ``k`` of each half forms a pair, and the pair is mixed with
    ``freqs[..., 0]`` and ``freqs[..., 1]``. The arithmetic runs in float32
    and the result is cast back to ``t``'s dtype and shape.
    """
    # assumes freqs broadcasts against (..., D // 2, 2) and ends in a 2x2 block — TODO confirm with callers
    leading_dims = t.shape[:-1]
    halves = t.reshape(*leading_dims, 2, -1)
    # (..., 2, D/2) -> (..., D/2, 1, 2): the final axis selects first/second half
    pairs = halves.movedim(-2, -1).unsqueeze(-2).float()
    first_half, second_half = pairs[..., 0], pairs[..., 1]
    mixed = freqs[..., 0] * first_half + freqs[..., 1] * second_half
    return mixed.movedim(-1, -2).reshape(*t.shape).type_as(t)
 # ---------------------- Feed Forward Network -----------------------
 class GPT2FeedForward(nn.Module):
    """Two bias-free linear layers with a GELU in between (d_model -> d_ff -> d_model).

    The linear layers are built through ``operations.Linear`` so the caller
    decides the concrete layer implementation.
    """
    def __init__(self, d_model: int, d_ff: int, device=None, dtype=None, operations=None) -> None:
        super().__init__()
        # Plain bookkeeping attributes; not used by forward()
        self._layer_id = None
        self._dim = d_model
        self._hidden_dim = d_ff
        linear_kwargs = {"bias": False, "device": device, "dtype": dtype}
        # Submodules are registered in the same order as before: activation, layer1, layer2
        self.activation = nn.GELU()
        self.layer1 = operations.Linear(d_model, d_ff, **linear_kwargs)
        self.layer2 = operations.Linear(d_ff, d_model, **linear_kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Expand with layer1, apply GELU, and project back with layer2."""
        return self.layer2(self.activation(self.layer1(x)))
 def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor) -> torch.Tensor:
    """Run multi-head attention on heads-last inputs via ``optimized_attention``.

    Each input is rearranged so the head axis follows the batch axis, any axes
    between batch and heads are flattened into one sequence axis, and the
    result is passed to ``optimized_attention`` with ``skip_reshape=True``.

    Dimension naming used in the argument names:
    - B: batch size
    - S: sequence length
    - H: number of attention heads
    - D: head dimension

    Args:
        q_B_S_H_D: Query tensor laid out as (batch, ..., n_heads, head_dim).
        k_B_S_H_D: Key tensor laid out as (batch, ..., n_heads, head_dim).
        v_B_S_H_D: Value tensor; it is viewed with the key tensor's batch,
            head and head_dim sizes.

    Returns:
        Whatever ``optimized_attention`` returns.
        NOTE(review): previously documented as (batch, seq_len, n_heads * head_dim) — confirm against optimized_attention.
    """
    q_shape = q_B_S_H_D.shape
    kv_shape = k_B_S_H_D.shape
    q_batch, q_heads, q_dim = q_shape[0], q_shape[-2], q_shape[-1]
    kv_batch, kv_heads, kv_dim = kv_shape[0], kv_shape[-2], kv_shape[-1]
    # Move heads next to batch, then collapse everything in between into one sequence axis
    q_B_H_S_D = rearrange(q_B_S_H_D, "b ... h k -> b h ... k").view(q_batch, q_heads, -1, q_dim)
    k_B_H_S_D = rearrange(k_B_S_H_D, "b ... h v -> b h ... v").view(kv_batch, kv_heads, -1, kv_dim)
    v_B_H_S_D = rearrange(v_B_S_H_D, "b ... h v -> b h ... v").view(kv_batch, kv_heads, -1, kv_dim)
    return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, q_heads, skip_reshape=True)
 class Attention(nn.Module):
    """
    Multi-head attention layer usable for self-attention or cross-attention.

    The mode is chosen at construction time: when ``context_dim`` is None the
    layer is a self-attention layer (keys and values are projected from the
    query input width), otherwise keys and values are projected from tensors
    of width ``context_dim``. Queries and keys are RMS-normalised per head;
    values are passed through unchanged. Rotary position embeddings are only
    applied in self-attention mode. All projections are bias-free.

    Args:
        query_dim (int): Width of the query input.
        context_dim (int, optional): Width of the key/value input. If None,
            ``query_dim`` is used and the layer is marked as self-attention. Default: None
        n_heads (int, optional): Number of attention heads. Default: 8
        head_dim (int, optional): Width of each attention head. Default: 64
        dropout (float, optional): Dropout probability on the output projection;
            values not above 1e-4 disable dropout entirely. Default: 0.0
        device, dtype: Forwarded to every layer built through ``operations``.
        operations: Namespace providing ``Linear`` and ``RMSNorm``; it is
            dereferenced unconditionally, so the ``None`` default cannot be used.
    """
    def __init__(
        self,
        query_dim: int,
        context_dim: Optional[int] = None,
        n_heads: int = 8,
        head_dim: int = 64,
        dropout: float = 0.0,
        device=None,
        dtype=None,
        operations=None,
    ) -> None:
        super().__init__()
        logging.debug(
            f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
            f"{n_heads} heads with a dimension of {head_dim}."
        )
        # Record the mode before context_dim is defaulted below
        self.is_selfattn = context_dim is None  # self attention
        if context_dim is None:
            context_dim = query_dim
        inner_dim = head_dim * n_heads
        factory_kwargs = {"device": device, "dtype": dtype}

        self.n_heads = n_heads
        self.head_dim = head_dim
        self.query_dim = query_dim
        self.context_dim = context_dim

        # Submodule creation order is kept: q, k, v (projection then norm), output
        self.q_proj = operations.Linear(query_dim, inner_dim, bias=False, **factory_kwargs)
        self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, **factory_kwargs)
        self.k_proj = operations.Linear(context_dim, inner_dim, bias=False, **factory_kwargs)
        self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, **factory_kwargs)
        self.v_proj = operations.Linear(context_dim, inner_dim, bias=False, **factory_kwargs)
        self.v_norm = nn.Identity()
        self.output_proj = operations.Linear(inner_dim, query_dim, bias=False, **factory_kwargs)
        if dropout > 1e-4:
            self.output_dropout = nn.Dropout(dropout)
        else:
            self.output_dropout = nn.Identity()

        self.attn_op = torch_attention_op
        self._query_dim = query_dim
        self._context_dim = context_dim
        self._inner_dim = inner_dim

    def compute_qkv(
        self,
        x: torch.Tensor,
        context: Optional[torch.Tensor] = None,
        rope_emb: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Project inputs to per-head q, k, v, normalise them, and apply rotary embeddings if applicable."""
        kv_input = x if context is None else context
        q = self.q_proj(x)
        k = self.k_proj(kv_input)
        v = self.v_proj(kv_input)
        # Split the projected width into (heads, head_dim)
        q, k, v = (
            rearrange(proj, "b ... (h d) -> b ... h d", h=self.n_heads, d=self.head_dim)
            for proj in (q, k, v)
        )
        q = self.q_norm(q)
        k = self.k_norm(k)
        v = self.v_norm(v)
        if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
            q = apply_rotary_pos_emb(q, rope_emb)
            k = apply_rotary_pos_emb(k, rope_emb)
        return q, k, v

    def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        """Run the attention op, then the output projection and dropout."""
        attended = self.attn_op(q, k, v)  # [B, S, H, D]
        projected = self.output_proj(attended)
        return self.output_dropout(projected)

    def forward(
        self,
        x: torch.Tensor,
        context: Optional[torch.Tensor] = None,
        rope_emb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            x (Tensor): The query tensor of shape [B, Mq, K]
            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
            rope_emb (Optional[Tensor]): Rotary embedding passed to compute_qkv; ignored for cross-attention
        """
        q, k, v = self.compute_qkv(x, context, rope_emb=rope_emb)
        return self.compute_attention(q, k, v)
 class Timesteps(nn.Module):
    """Sinusoidal embedding of a 2-D batch of timesteps.

    The output's last axis holds ``num_channels // 2`` cosine values followed
    by ``num_channels // 2`` sine values.
    """
    def __init__(self, num_channels: int):
        super().__init__()
        self.num_channels = num_channels

    def forward(self, timesteps_B_T: torch.Tensor) -> torch.Tensor:
        """Embed ``timesteps_B_T`` (must be 2-D) and return a tensor with one extra trailing axis."""
        assert timesteps_B_T.ndim == 2, f"Expected 2D input, got {timesteps_B_T.ndim}"
        batch, frames = timesteps_B_T.shape[0], timesteps_B_T.shape[1]
        flat_t = timesteps_B_T.flatten().float()
        half_dim = self.num_channels // 2
        positions = torch.arange(half_dim, dtype=torch.float32, device=flat_t.device)
        # Frequencies fall off geometrically from 1 towards 1/10000
        freqs = torch.exp((-math.log(10000) * positions) / (half_dim - 0.0))
        angles = flat_t[:, None].float() * freqs[None, :]
        emb = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        # Undo the flatten: (B*T, D) -> (B, T, D)
        return rearrange(emb, "(b t) d -> b t d", b=batch, t=frames)
 class TimestepEmbedding(nn.Module):
    """
    Two-layer MLP (Linear -> SiLU -> Linear) applied to timestep features.
    With `use_adaln_lora` the second layer is three times as wide and its output is returned as the
    AdaLN-LoRA term while the input itself is passed through as the embedding; otherwise the MLP
    output is the embedding and no LoRA term is produced.
    Parameters:
        in_features (int): width of the input features.
        out_features (int): width of the hidden layer (and of the output when LoRA is off).
        use_adaln_lora (bool): selects the mode described above; the first layer only has a bias
            when this is False. Default: False
        device, dtype: forwarded to `operations.Linear`.
        operations: namespace providing the `Linear` layer class.
    """
    def __init__(self, in_features: int, out_features: int, use_adaln_lora: bool = False, device=None, dtype=None, operations=None):
        super().__init__()
        logging.debug(
            f"Using AdaLN LoRA Flag:  {use_adaln_lora}. We enable bias if no AdaLN LoRA for backward compatibility."
        )
        self.in_dim = in_features
        self.out_dim = out_features
        self.use_adaln_lora = use_adaln_lora
        # In LoRA mode the second layer emits three chunks per feature.
        second_width = 3 * out_features if use_adaln_lora else out_features
        self.linear_1 = operations.Linear(in_features, out_features, bias=not use_adaln_lora, device=device, dtype=dtype)
        self.activation = nn.SiLU()
        self.linear_2 = operations.Linear(out_features, second_width, bias=False, device=device, dtype=dtype)
    def forward(self, sample: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Returns:
            (emb_B_T_D, adaln_lora_B_T_3D): `(sample, mlp(sample))` in LoRA mode, else `(mlp(sample), None)`.
        """
        hidden = self.linear_2(self.activation(self.linear_1(sample)))
        if self.use_adaln_lora:
            return sample, hidden
        return hidden, None
 class PatchEmbed(nn.Module):
    """
    PatchEmbed turns a (B, C, T, H, W) tensor into patch embeddings. The input is cut into
    non-overlapping patches of `temporal_patch_size` x `spatial_patch_size` x `spatial_patch_size`,
    each patch is flattened together with its channels, and a bias-free linear layer maps it to a
    vector of size `out_channels`.
    Parameters:
    - spatial_patch_size (int): The size of each spatial patch.
    - temporal_patch_size (int): The size of each temporal patch.
    - in_channels (int): Number of input channels. Default: 3.
    - out_channels (int): The dimension of the embedding vector for each patch. Default: 768.
    - device, dtype: Forwarded to `operations.Linear`.
    - operations: Namespace providing the `Linear` layer class.
    """
    def __init__(
        self,
        spatial_patch_size: int,
        temporal_patch_size: int,
        in_channels: int = 3,
        out_channels: int = 768,
        device=None, dtype=None, operations=None
    ):
        super().__init__()
        self.spatial_patch_size = spatial_patch_size
        self.temporal_patch_size = temporal_patch_size
        # Number of scalars in one flattened patch (channels x time x height x width).
        patch_features = in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size
        to_patches = Rearrange(
            "b c (t r) (h m) (w n) -> b t h w (c r m n)",
            r=temporal_patch_size,
            m=spatial_patch_size,
            n=spatial_patch_size,
        )
        projection = operations.Linear(patch_features, out_channels, bias=False, device=device, dtype=dtype)
        # Keep the two-stage Sequential so the linear layer stays at index 1 of `proj`.
        self.proj = nn.Sequential(to_patches, projection)
        self.dim = patch_features
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the PatchEmbed module.
        Parameters:
        - x (torch.Tensor): The input tensor of shape (B, C, T, H, W) where
            B is the batch size,
            C is the number of channels,
            T is the temporal dimension,
            H is the height, and
            W is the width of the input.
          H and W must be multiples of `spatial_patch_size`, T a multiple of `temporal_patch_size`.
        Returns:
        - torch.Tensor: The embedded patches as a tensor, with shape b t h w c.
        """
        assert x.dim() == 5
        frames, height, width = x.shape[2:]
        patch = self.spatial_patch_size
        assert (
            height % patch == 0 and width % patch == 0
        ), f"H,W {(height, width)} should be divisible by spatial_patch_size {self.spatial_patch_size}"
        assert frames % self.temporal_patch_size == 0
        return self.proj(x)
 class FinalLayer(nn.Module):
    """
    The final layer of video DiT: an affine-free LayerNorm modulated by a (shift, scale) pair derived
    from the timestep embedding, followed by a bias-free linear projection to
    `spatial_patch_size**2 * temporal_patch_size * out_channels` values per token.
    Parameters:
        hidden_size (int): width of the incoming tokens.
        spatial_patch_size (int), temporal_patch_size (int), out_channels (int): determine the output width.
        use_adaln_lora (bool): add a low-rank bottleneck to the modulation head and expect an extra
            LoRA term in `forward`. Default: False
        adaln_lora_dim (int): bottleneck width used when `use_adaln_lora` is set. Default: 256
        device, dtype: forwarded to `operations.Linear`.
        operations: namespace providing the `Linear` layer class.
    """
    def __init__(
        self,
        hidden_size: int,
        spatial_patch_size: int,
        temporal_patch_size: int,
        out_channels: int,
        use_adaln_lora: bool = False,
        adaln_lora_dim: int = 256,
        device=None, dtype=None, operations=None
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_adaln_chunks = 2
        self.use_adaln_lora = use_adaln_lora
        self.adaln_lora_dim = adaln_lora_dim
        self.layer_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        patch_out = spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels
        self.linear = operations.Linear(hidden_size, patch_out, bias=False, device=device, dtype=dtype)
        # The modulation head emits one shift chunk and one scale chunk.
        modulation_width = self.n_adaln_chunks * hidden_size
        head = [nn.SiLU()]
        if use_adaln_lora:
            head.append(operations.Linear(hidden_size, adaln_lora_dim, bias=False, device=device, dtype=dtype))
            head.append(operations.Linear(adaln_lora_dim, modulation_width, bias=False, device=device, dtype=dtype))
        else:
            head.append(operations.Linear(hidden_size, modulation_width, bias=False, device=device, dtype=dtype))
        self.adaln_modulation = nn.Sequential(*head)
    def forward(
        self,
        x_B_T_H_W_D: torch.Tensor,
        emb_B_T_D: torch.Tensor,
        adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            x_B_T_H_W_D (Tensor): tokens to project.
            emb_B_T_D (Tensor): timestep embedding fed to the modulation head.
            adaln_lora_B_T_3D (Optional[Tensor]): required when `use_adaln_lora` is set; only its first
                `2 * hidden_size` features on the last axis are added to the modulation.
        Returns:
            Tensor: `linear(layer_norm(x) * (1 + scale) + shift)`.
        """
        if self.use_adaln_lora:
            assert adaln_lora_B_T_3D is not None
            modulation = self.adaln_modulation(emb_B_T_D) + adaln_lora_B_T_3D[:, :, : 2 * self.hidden_size]
        else:
            modulation = self.adaln_modulation(emb_B_T_D)
        shift_B_T_D, scale_B_T_D = modulation.chunk(2, dim=-1)
        # Insert two singleton axes so shift/scale broadcast over H and W.
        shift = rearrange(shift_B_T_D, "b t d -> b t 1 1 d")
        scale = rearrange(scale_B_T_D, "b t d -> b t 1 1 d")
        modulated = self.layer_norm(x_B_T_H_W_D) * (1 + scale) + shift
        return self.linear(modulated)
 class Block(nn.Module):
    """
    Transformer block made of self-attention, cross-attention and an MLP, each under AdaLN modulation.
    Every sub-layer has its own affine-free LayerNorm and its own modulation head, which turns the
    timestep embedding into a (shift, scale, gate) triple. The normalized input is scaled and shifted,
    passed through the sub-layer, multiplied by the gate and added back onto the residual stream.
    Parameters:
        x_dim (int): Dimension of input features
        context_dim (int): Dimension of context features for cross-attention
        num_heads (int): Number of attention heads
        mlp_ratio (float): Multiplier for MLP hidden dimension. Default: 4.0
        use_adaln_lora (bool): Whether to use AdaLN-LoRA modulation. Default: False
        adaln_lora_dim (int): Hidden dimension for AdaLN-LoRA layers. Default: 256
        device, dtype: Forwarded to every layer constructor.
        operations: Namespace providing the `Linear` and `LayerNorm` layer classes.
    The block applies the following sequence:
    1. Self-attention with AdaLN modulation
    2. Cross-attention with AdaLN modulation
    3. MLP with AdaLN modulation
    """
    def __init__(
        self,
        x_dim: int,
        context_dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        use_adaln_lora: bool = False,
        adaln_lora_dim: int = 256,
        device=None,
        dtype=None,
        operations=None,
    ):
        super().__init__()
        self.x_dim = x_dim
        factory_kwargs = {"device": device, "dtype": dtype}
        head_dim = x_dim // num_heads
        def _affine_free_norm():
            return operations.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        def _modulation_head() -> nn.Sequential:
            # SiLU followed by either one Linear or a low-rank pair of Linears, always emitting 3 * x_dim.
            layers = [nn.SiLU()]
            if use_adaln_lora:
                layers.append(operations.Linear(x_dim, adaln_lora_dim, bias=False, **factory_kwargs))
                layers.append(operations.Linear(adaln_lora_dim, 3 * x_dim, bias=False, **factory_kwargs))
            else:
                layers.append(operations.Linear(x_dim, 3 * x_dim, bias=False, **factory_kwargs))
            return nn.Sequential(*layers)
        # Sub-modules are created in the same order as before so registration order is unchanged.
        self.layer_norm_self_attn = _affine_free_norm()
        self.self_attn = Attention(x_dim, None, num_heads, head_dim, operations=operations, **factory_kwargs)
        self.layer_norm_cross_attn = _affine_free_norm()
        self.cross_attn = Attention(x_dim, context_dim, num_heads, head_dim, operations=operations, **factory_kwargs)
        self.layer_norm_mlp = _affine_free_norm()
        self.mlp = GPT2FeedForward(x_dim, int(x_dim * mlp_ratio), operations=operations, **factory_kwargs)
        self.use_adaln_lora = use_adaln_lora
        self.adaln_modulation_self_attn = _modulation_head()
        self.adaln_modulation_cross_attn = _modulation_head()
        self.adaln_modulation_mlp = _modulation_head()
    def forward(
        self,
        x_B_T_H_W_D: torch.Tensor,
        emb_B_T_D: torch.Tensor,
        crossattn_emb: torch.Tensor,
        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
        adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
        extra_per_block_pos_emb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            x_B_T_H_W_D (Tensor): input tokens on a (T, H, W) grid.
            emb_B_T_D (Tensor): timestep embedding fed to the three modulation heads.
            crossattn_emb (Tensor): context passed to the cross-attention layer.
            rope_emb_L_1_1_D (Optional[Tensor]): rotary embedding handed to both attention layers.
            adaln_lora_B_T_3D (Optional[Tensor]): added to every modulation head's output when
                `use_adaln_lora` is set.
            extra_per_block_pos_emb (Optional[Tensor]): added to the input before anything else when given.
        Returns:
            Tensor: the updated tokens, same layout as the input.
        """
        if extra_per_block_pos_emb is not None:
            x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb
        def _shift_scale_gate(head):
            raw = head(emb_B_T_D)
            if self.use_adaln_lora:
                raw = raw + adaln_lora_B_T_3D
            shift, scale, gate = raw.chunk(3, dim=-1)
            # Reshape from (B, T, D) to (B, T, 1, 1, D) for broadcasting over H and W.
            return tuple(rearrange(term, "b t d -> b t 1 1 d") for term in (shift, scale, gate))
        sa_shift, sa_scale, sa_gate = _shift_scale_gate(self.adaln_modulation_self_attn)
        ca_shift, ca_scale, ca_gate = _shift_scale_gate(self.adaln_modulation_cross_attn)
        mlp_shift, mlp_scale, mlp_gate = _shift_scale_gate(self.adaln_modulation_mlp)
        _, T, H, W, _ = x_B_T_H_W_D.shape
        def _to_tokens(grid):
            # The attention layers work on a flat token axis.
            return rearrange(grid, "b t h w d -> b (t h w) d")
        def _to_grid(tokens):
            return rearrange(tokens, "b (t h w) d -> b t h w d", t=T, h=H, w=W)
        # 1. Self-attention (context is None).
        attn_in = self.layer_norm_self_attn(x_B_T_H_W_D) * (1 + sa_scale) + sa_shift
        attn_out = _to_grid(self.self_attn(_to_tokens(attn_in), None, rope_emb=rope_emb_L_1_1_D))
        x_B_T_H_W_D = x_B_T_H_W_D + sa_gate * attn_out
        # 2. Cross-attention against crossattn_emb.
        attn_in = self.layer_norm_cross_attn(x_B_T_H_W_D) * (1 + ca_scale) + ca_shift
        attn_out = _to_grid(self.cross_attn(_to_tokens(attn_in), crossattn_emb, rope_emb=rope_emb_L_1_1_D))
        x_B_T_H_W_D = attn_out * ca_gate + x_B_T_H_W_D
        # 3. MLP, applied directly on the 5D layout.
        mlp_in = self.layer_norm_mlp(x_B_T_H_W_D) * (1 + mlp_scale) + mlp_shift
        x_B_T_H_W_D = x_B_T_H_W_D + mlp_gate * self.mlp(mlp_in)
        return x_B_T_H_W_D
 class MiniTrainDIT(nn.Module):
    """
    A general implementation of an AdaLN-modulated, ViT-like (DiT) transformer for video processing.
    Per the original note, this is a clean implementation that can load and reproduce the training
    results of the original DIT model (cosmos 1).
    Args:
        max_img_h (int): Maximum height of the input images.
        max_img_w (int): Maximum width of the input images.
        max_frames (int): Maximum number of frames in the video sequence.
        in_channels (int): Number of input channels (e.g., RGB channels for color images).
        out_channels (int): Number of output channels.
        patch_spatial (int): Spatial patch size, used for both height and width.
        patch_temporal (int): Temporal patch size.
        concat_padding_mask (bool): If True, includes a mask channel in the input to handle padding.
        model_channels (int): Base number of channels used throughout the model.
        num_blocks (int): Number of transformer blocks.
        num_heads (int): Number of heads in the multi-head attention layers.
        mlp_ratio (float): Expansion ratio for MLP blocks.
        crossattn_emb_channels (int): Number of embedding channels for cross-attention.
        pos_emb_cls (str): Type of positional embeddings; `build_pos_embed` only accepts "rope3d".
        pos_emb_learnable (bool): Whether positional embeddings are learnable.
        pos_emb_interpolation (str): Method for interpolating positional embeddings.
        min_fps (int): Minimum frames per second.
        max_fps (int): Maximum frames per second.
        use_adaln_lora (bool): Whether to use AdaLN-LoRA.
        adaln_lora_dim (int): Dimension for AdaLN-LoRA.
        rope_h_extrapolation_ratio (float): Height extrapolation ratio for RoPE.
        rope_w_extrapolation_ratio (float): Width extrapolation ratio for RoPE.
        rope_t_extrapolation_ratio (float): Temporal extrapolation ratio for RoPE.
        extra_per_block_abs_pos_emb (bool): Whether to use extra per-block absolute positional embeddings.
        extra_h_extrapolation_ratio (float): Height extrapolation ratio for extra embeddings.
        extra_w_extrapolation_ratio (float): Width extrapolation ratio for extra embeddings.
        extra_t_extrapolation_ratio (float): Temporal extrapolation ratio for extra embeddings.
        rope_enable_fps_modulation (bool): Passed to the positional embedder as `enable_fps_modulation`.
        image_model: Accepted but not used by this class.
        device, dtype: Forwarded to the sub-module constructors.
        operations: Namespace providing the `Linear`, `LayerNorm` and `RMSNorm` layer classes.
    """
    def __init__(
        self,
        max_img_h: int,
        max_img_w: int,
        max_frames: int,
        in_channels: int,
        out_channels: int,
        patch_spatial: int,  # tuple,
        patch_temporal: int,
        concat_padding_mask: bool = True,
        # attention settings
        model_channels: int = 768,
        num_blocks: int = 10,
        num_heads: int = 16,
        mlp_ratio: float = 4.0,
        # cross attention settings
        crossattn_emb_channels: int = 1024,
        # positional embedding settings
        pos_emb_cls: str = "sincos",
        pos_emb_learnable: bool = False,
        pos_emb_interpolation: str = "crop",
        min_fps: int = 1,
        max_fps: int = 30,
        use_adaln_lora: bool = False,
        adaln_lora_dim: int = 256,
        rope_h_extrapolation_ratio: float = 1.0,
        rope_w_extrapolation_ratio: float = 1.0,
        rope_t_extrapolation_ratio: float = 1.0,
        extra_per_block_abs_pos_emb: bool = False,
        extra_h_extrapolation_ratio: float = 1.0,
        extra_w_extrapolation_ratio: float = 1.0,
        extra_t_extrapolation_ratio: float = 1.0,
        rope_enable_fps_modulation: bool = True,
        image_model=None,
        device=None,
        dtype=None,
        operations=None,
    ) -> None:
        super().__init__()
        self.dtype = dtype
        # input / output geometry
        self.max_img_h = max_img_h
        self.max_img_w = max_img_w
        self.max_frames = max_frames
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.patch_spatial = patch_spatial
        self.patch_temporal = patch_temporal
        self.concat_padding_mask = concat_padding_mask
        # transformer size
        self.model_channels = model_channels
        self.num_blocks = num_blocks
        self.num_heads = num_heads
        # positional embedding settings
        self.pos_emb_cls = pos_emb_cls
        self.pos_emb_learnable = pos_emb_learnable
        self.pos_emb_interpolation = pos_emb_interpolation
        self.min_fps = min_fps
        self.max_fps = max_fps
        self.rope_h_extrapolation_ratio = rope_h_extrapolation_ratio
        self.rope_w_extrapolation_ratio = rope_w_extrapolation_ratio
        self.rope_t_extrapolation_ratio = rope_t_extrapolation_ratio
        self.rope_enable_fps_modulation = rope_enable_fps_modulation
        self.extra_per_block_abs_pos_emb = extra_per_block_abs_pos_emb
        self.extra_h_extrapolation_ratio = extra_h_extrapolation_ratio
        self.extra_w_extrapolation_ratio = extra_w_extrapolation_ratio
        self.extra_t_extrapolation_ratio = extra_t_extrapolation_ratio
        # The positional embedders are built first, before any other sub-module is registered.
        self.build_pos_embed(device=device, dtype=dtype)
        self.use_adaln_lora = use_adaln_lora
        self.adaln_lora_dim = adaln_lora_dim
        self.t_embedder = nn.Sequential(
            Timesteps(model_channels),
            TimestepEmbedding(
                model_channels,
                model_channels,
                use_adaln_lora=use_adaln_lora,
                device=device,
                dtype=dtype,
                operations=operations,
            ),
        )
        # The padding mask, when concatenated, occupies one extra input channel.
        embed_in_channels = in_channels + 1 if concat_padding_mask else in_channels
        self.x_embedder = PatchEmbed(
            spatial_patch_size=patch_spatial,
            temporal_patch_size=patch_temporal,
            in_channels=embed_in_channels,
            out_channels=model_channels,
            device=device,
            dtype=dtype,
            operations=operations,
        )
        block_list = []
        for _ in range(num_blocks):
            block_list.append(
                Block(
                    x_dim=model_channels,
                    context_dim=crossattn_emb_channels,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    use_adaln_lora=use_adaln_lora,
                    adaln_lora_dim=adaln_lora_dim,
                    device=device,
                    dtype=dtype,
                    operations=operations,
                )
            )
        self.blocks = nn.ModuleList(block_list)
        self.final_layer = FinalLayer(
            hidden_size=model_channels,
            spatial_patch_size=patch_spatial,
            temporal_patch_size=patch_temporal,
            out_channels=out_channels,
            use_adaln_lora=use_adaln_lora,
            adaln_lora_dim=adaln_lora_dim,
            device=device,
            dtype=dtype,
            operations=operations,
        )
        self.t_embedding_norm = operations.RMSNorm(model_channels, eps=1e-6, device=device, dtype=dtype)
    def build_pos_embed(self, device=None, dtype=None) -> None:
        """
        Create `self.pos_embedder` and, when `extra_per_block_abs_pos_emb` is set, `self.extra_pos_embedder`.
        Raises:
            ValueError: if `self.pos_emb_cls` is anything other than "rope3d".
        """
        if self.pos_emb_cls != "rope3d":
            raise ValueError(f"Unknown pos_emb_cls {self.pos_emb_cls}")
        cls_type = VideoRopePosition3DEmb
        logging.debug(f"Building positional embedding with {self.pos_emb_cls} class, impl {cls_type}")
        kwargs = {
            "model_channels": self.model_channels,
            # grid lengths are expressed in patches, not pixels/frames
            "len_h": self.max_img_h // self.patch_spatial,
            "len_w": self.max_img_w // self.patch_spatial,
            "len_t": self.max_frames // self.patch_temporal,
            "max_fps": self.max_fps,
            "min_fps": self.min_fps,
            "is_learnable": self.pos_emb_learnable,
            "interpolation": self.pos_emb_interpolation,
            "head_dim": self.model_channels // self.num_heads,
            "h_extrapolation_ratio": self.rope_h_extrapolation_ratio,
            "w_extrapolation_ratio": self.rope_w_extrapolation_ratio,
            "t_extrapolation_ratio": self.rope_t_extrapolation_ratio,
            "enable_fps_modulation": self.rope_enable_fps_modulation,
            "device": device,
        }
        self.pos_embedder = cls_type(**kwargs)  # type: ignore
        if self.extra_per_block_abs_pos_emb:
            # Same settings as the main embedder, with the "extra" ratios swapped in and a dtype added.
            kwargs.update(
                h_extrapolation_ratio=self.extra_h_extrapolation_ratio,
                w_extrapolation_ratio=self.extra_w_extrapolation_ratio,
                t_extrapolation_ratio=self.extra_t_extrapolation_ratio,
                device=device,
                dtype=dtype,
            )
            self.extra_pos_embedder = LearnablePosEmbAxis(**kwargs)  # type: ignore
    def prepare_embedded_sequence(
        self,
        x_B_C_T_H_W: torch.Tensor,
        fps: Optional[torch.Tensor] = None,
        padding_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Patch-embed the input video and compute its positional embeddings.
        Args:
            x_B_C_T_H_W (torch.Tensor): video
            fps (Optional[torch.Tensor]): Frames per second tensor, forwarded to the positional embedders.
            padding_mask (Optional[torch.Tensor]): only used when `self.concat_padding_mask` is True;
                resized to the input's H x W with nearest interpolation, or replaced by zeros when None.
        Returns:
            Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
                - The output of `self.x_embedder` (with the positional embedding added to it in the
                  non-rope case).
                - The output of `self.pos_embedder` if 'rope' is in `self.pos_emb_cls` (case insensitive),
                  otherwise None.
                - The output of `self.extra_pos_embedder` if `self.extra_per_block_abs_pos_emb` is set,
                  otherwise None.
        Notes:
            - If `self.concat_padding_mask` is True, the mask is repeated along T and concatenated to the
              input as one extra channel before patch embedding.
        """
        if self.concat_padding_mask:
            in_shape = x_B_C_T_H_W.shape
            if padding_mask is None:
                padding_mask = torch.zeros(
                    in_shape[0], 1, in_shape[3], in_shape[4], dtype=x_B_C_T_H_W.dtype, device=x_B_C_T_H_W.device
                )
            else:
                padding_mask = transforms.functional.resize(
                    padding_mask, list(in_shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
                )
            mask_B_1_T_H_W = padding_mask.unsqueeze(1).repeat(1, 1, in_shape[2], 1, 1)
            x_B_C_T_H_W = torch.cat([x_B_C_T_H_W, mask_B_1_T_H_W], dim=1)
        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)
        # Device/dtype are read from the (possibly mask-extended) input tensor.
        in_device = x_B_C_T_H_W.device
        extra_pos_emb = None
        if self.extra_per_block_abs_pos_emb:
            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=in_device, dtype=x_B_C_T_H_W.dtype)
        if "rope" in self.pos_emb_cls.lower():
            rope_emb = self.pos_embedder(x_B_T_H_W_D, fps=fps, device=in_device)
            return x_B_T_H_W_D, rope_emb, extra_pos_emb
        x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, device=in_device)  # [B, T, H, W, D]
        return x_B_T_H_W_D, None, extra_pos_emb
    def unpatchify(self, x_B_T_H_W_M: torch.Tensor) -> torch.Tensor:
        """Fold the per-token patch values back into a (B, C, T*t, H*p, W*p) tensor."""
        return rearrange(
            x_B_T_H_W_M,
            "B T H W (p1 p2 t C) -> B C (T t) (H p1) (W p2)",
            p1=self.patch_spatial,
            p2=self.patch_spatial,
            t=self.patch_temporal,
        )
    def forward(
        self,
        x: torch.Tensor,
        timesteps: torch.Tensor,
        context: torch.Tensor,
        fps: Optional[torch.Tensor] = None,
        padding_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        """
        Args:
            x: (B, C, T, H, W) tensor of spatial-temp inputs
            timesteps: (B, ) tensor of timesteps; a 1-D tensor gets a trailing axis of size 1
            context: (B, N, D) tensor of cross-attention embeddings
            fps, padding_mask: forwarded to `prepare_embedded_sequence`
            **kwargs: accepted and ignored
        Returns:
            The result of `unpatchify` applied to the final layer's output.
        """
        x_B_T_H_W_D, rope_emb_L_1_1_D, extra_pos_emb = self.prepare_embedded_sequence(
            x,
            fps=fps,
            padding_mask=padding_mask,
        )
        timesteps_B_T = timesteps.unsqueeze(1) if timesteps.ndim == 1 else timesteps
        # t_embedder[0] yields the sinusoidal features, t_embedder[1] the learned embedding (+ LoRA term).
        timestep_features = self.t_embedder[0](timesteps_B_T).to(x_B_T_H_W_D.dtype)
        t_embedding_B_T_D, adaln_lora_B_T_3D = self.t_embedder[1](timestep_features)
        t_embedding_B_T_D = self.t_embedding_norm(t_embedding_B_T_D)
        # for logging purpose
        self.affline_scale_log_info = {"t_embedding_B_T_D": t_embedding_B_T_D.detach()}
        self.affline_emb = t_embedding_B_T_D
        self.crossattn_emb = context
        if extra_pos_emb is not None:
            assert (
                x_B_T_H_W_D.shape == extra_pos_emb.shape
            ), f"{x_B_T_H_W_D.shape} != {extra_pos_emb.shape}"
        # Insert an axis at position 1, then a leading axis, before handing the rope embedding to the blocks.
        rope_for_blocks = rope_emb_L_1_1_D.unsqueeze(1).unsqueeze(0)
        for block in self.blocks:
            x_B_T_H_W_D = block(
                x_B_T_H_W_D,
                t_embedding_B_T_D,
                context,
                rope_emb_L_1_1_D=rope_for_blocks,
                adaln_lora_B_T_3D=adaln_lora_B_T_3D,
                extra_per_block_pos_emb=extra_pos_emb,
            )
        x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D, t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
        return self.unpatchify(x_B_T_H_W_O)
--- a/comfy/ldm/flux/controlnet.py
+++ b/comfy/ldm/flux/controlnet.py
@ -121,6 +121,11 @@ class ControlNetFlux(Flux):
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")
        if y is None:
            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
        else:
            y = y[:, :self.params.vec_in_dim]
        # running on sequences img
        img = self.img_in(img)
@ -174,7 +179,7 @@ class ControlNetFlux(Flux):
            out["output"] = out_output[:self.main_model_single]
        return out
-    def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
+    def forward(self, x, timesteps, context, y=None, guidance=None, hint=None, **kwargs):
        patch_size = 2
        if self.latent_input:
            hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@ -118,7 +118,7 @@ class Modulation(nn.Module):
 def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
    if modulation_dims is None:
        if m_add is not None:
-            return tensor * m_mult + m_add
+            return torch.addcmul(m_add, tensor, m_mult)
        else:
            return tensor * m_mult
    else:
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@ -101,6 +101,10 @@ class Flux(nn.Module):
        transformer_options={},
        attn_mask: Tensor = None,
    ) -> Tensor:
        if y is None:
            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
        patches_replace = transformer_options.get("patches_replace", {})
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")
@ -155,6 +159,9 @@ class Flux(nn.Module):
                    if add is not None:
                        img += add
        if img.dtype == torch.float16:
            img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504)
        img = torch.cat((txt, img), 1)
        for i, block in enumerate(self.single_blocks):
@ -188,20 +195,50 @@ class Flux(nn.Module):
        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img
-    def forward(self, x, timestep, context, y, guidance=None, control=None, transformer_options={}, **kwargs):
+    def process_img(self, x, index=0, h_offset=0, w_offset=0):
        bs, c, h, w = x.shape
        patch_size = self.patch_size
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
        h_len = ((h + (patch_size // 2)) // patch_size)
        w_len = ((w + (patch_size // 2)) // patch_size)
        h_offset = ((h_offset + (patch_size // 2)) // patch_size)
        w_offset = ((w_offset + (patch_size // 2)) // patch_size)
        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
+        img_ids[:, :, 0] = img_ids[:, :, 1] + index
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
+        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
+        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
        return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)
    def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
        bs, c, h_orig, w_orig = x.shape
        patch_size = self.patch_size
        h_len = ((h_orig + (patch_size // 2)) // patch_size)
        w_len = ((w_orig + (patch_size // 2)) // patch_size)
        img, img_ids = self.process_img(x)
        img_tokens = img.shape[1]
        if ref_latents is not None:
            h = 0
            w = 0
            for ref in ref_latents:
                h_offset = 0
                w_offset = 0
                if ref.shape[-2] + h > ref.shape[-1] + w:
                    w_offset = w
                else:
                    h_offset = h
                kontext, kontext_ids = self.process_img(ref, index=1, h_offset=h_offset, w_offset=w_offset)
                img = torch.cat([img, kontext], dim=1)
                img_ids = torch.cat([img_ids, kontext_ids], dim=1)
                h = max(h, ref.shape[-2] + h_offset)
                w = max(w, ref.shape[-1] + w_offset)
        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
+        out = out[:, :img_tokens]
        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig]
--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@ -261,8 +261,8 @@ class CrossAttention(nn.Module):
        self.heads = heads
        self.dim_head = dim_head
-        self.q_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device)
+        self.q_norm = operations.RMSNorm(inner_dim, eps=1e-5, dtype=dtype, device=device)
-        self.k_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device)
+        self.k_norm = operations.RMSNorm(inner_dim, eps=1e-5, dtype=dtype, device=device)
        self.to_q = operations.Linear(query_dim, inner_dim, bias=True, dtype=dtype, device=device)
        self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@ -973,7 +973,7 @@ class VideoVAE(nn.Module):
            norm_layer=config.get("norm_layer", "group_norm"),
            causal=config.get("causal_decoder", False),
            timestep_conditioning=self.timestep_conditioning,
-            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
+            spatial_padding_mode=config.get("spatial_padding_mode", "reflect"),
        )
        self.per_channel_statistics = processor()
--- a/comfy/ldm/models/autoencoder.py
+++ b/comfy/ldm/models/autoencoder.py
@ -11,7 +11,7 @@ from comfy.ldm.modules.ema import LitEma
 import comfy.ops
 class DiagonalGaussianRegularizer(torch.nn.Module):
-    def __init__(self, sample: bool = True):
+    def __init__(self, sample: bool = False):
        super().__init__()
        self.sample = sample
@ -19,16 +19,12 @@ class DiagonalGaussianRegularizer(torch.nn.Module):
        yield from ()
    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
        log = dict()
        posterior = DiagonalGaussianDistribution(z)
        if self.sample:
            z = posterior.sample()
        else:
            z = posterior.mode()
-        kl_loss = posterior.kl()
+        return z, None
        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
        log["kl_loss"] = kl_loss
        return z, log
 class AbstractAutoencoder(torch.nn.Module):
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@ -753,7 +753,7 @@ class BasicTransformerBlock(nn.Module):
            for p in patch:
                n = p(n, extra_options)
-        x += n
+        x = n + x
        if "middle_patch" in transformer_patches:
            patch = transformer_patches["middle_patch"]
            for p in patch:
@ -793,12 +793,12 @@ class BasicTransformerBlock(nn.Module):
            for p in patch:
                n = p(n, extra_options)
-        x += n
+        x = n + x
        if self.is_res:
            x_skip = x
        x = self.ff(self.norm3(x))
        if self.is_res:
-            x += x_skip
+            x = x_skip + x
        return x
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@ -36,7 +36,7 @@ def get_timestep_embedding(timesteps, embedding_dim):
 def nonlinearity(x):
    # swish
-    return x*torch.sigmoid(x)
+    return torch.nn.functional.silu(x)
 def Normalize(in_channels, num_groups=32):
--- a/comfy/ldm/modules/sub_quadratic_attention.py
+++ b/comfy/ldm/modules/sub_quadratic_attention.py
@ -31,7 +31,7 @@ def dynamic_slice(
    starts: List[int],
    sizes: List[int],
 ) -> Tensor:
-    slicing = [slice(start, start + size) for start, size in zip(starts, sizes)]
+    slicing = tuple(slice(start, start + size) for start, size in zip(starts, sizes))
    return x[slicing]
 class AttnChunk(NamedTuple):
--- a/comfy/ldm/omnigen/omnigen2.py
+++ b/comfy/ldm/omnigen/omnigen2.py
@ -0,0 +1,469 @@
 # Original code: https://github.com/VectorSpaceLab/OmniGen2
 from typing import Optional, Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
 from comfy.ldm.lightricks.model import Timesteps
 from comfy.ldm.flux.layers import EmbedND
 from comfy.ldm.modules.attention import optimized_attention_masked
 import comfy.model_management
 import comfy.ldm.common_dit
 def apply_rotary_emb(x, freqs_cis):
    """Apply the rotation table ``freqs_cis`` to ``x``.

    The last axis of ``x`` is regrouped into trailing ``(-1, 1, 2)`` axes, the
    two components of every pair are weighted by ``freqs_cis[..., 0]`` and
    ``freqs_cis[..., 1]`` respectively, and the sum is reshaped back to the
    shape of ``x`` and cast to the dtype of ``x``.

    When ``x.shape[1]`` is zero the input is returned untouched.
    """
    if x.shape[1] == 0:
        return x
    original_shape = x.shape
    pairs = x.reshape(*original_shape[:-1], -1, 1, 2)
    first_term = freqs_cis[..., 0] * pairs[..., 0]
    second_term = freqs_cis[..., 1] * pairs[..., 1]
    rotated = first_term + second_term
    return rotated.reshape(*original_shape).to(dtype=x.dtype)
 def swiglu(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Return ``silu(x) * y`` (SiLU-gated elementwise product)."""
    gate = F.silu(x)
    return gate * y
 class TimestepEmbedding(nn.Module):
    """Small MLP: ``Linear -> SiLU -> Linear``.

    ``operations`` supplies the ``Linear`` implementation; ``dtype`` and
    ``device`` are forwarded to both linear layers.
    """
    def __init__(self, in_channels: int, time_embed_dim: int, dtype=None, device=None, operations=None):
        super().__init__()
        factory_kwargs = dict(dtype=dtype, device=device)
        self.linear_1 = operations.Linear(in_channels, time_embed_dim, **factory_kwargs)
        self.act = nn.SiLU()
        self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, **factory_kwargs)
    def forward(self, sample: torch.Tensor) -> torch.Tensor:
        """Project ``sample`` through the first layer, activation and second layer."""
        hidden = self.linear_1(sample)
        return self.linear_2(self.act(hidden))
 class LuminaRMSNormZero(nn.Module):
    def __init__(self, embedding_dim: int, norm_eps: float = 1e-5, dtype=None, device=None, operations=None):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = operations.Linear(min(embedding_dim, 1024), 4 * embedding_dim, dtype=dtype, device=device)
        self.norm = operations.RMSNorm(embedding_dim, eps=norm_eps, dtype=dtype, device=device)
    def forward(self, x: torch.Tensor, emb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        emb = self.linear(self.silu(emb))
        scale_msa, gate_msa, scale_mlp, gate_mlp = emb.chunk(4, dim=1)
        x = self.norm(x) * (1 + scale_msa[:, None])
        return x, gate_msa, scale_mlp, gate_mlp
 class LuminaLayerNormContinuous(nn.Module):
    """LayerNorm whose output is scaled by a projection of a conditioning vector.

    An optional second linear layer (created only when ``out_dim`` is given)
    projects the modulated result to ``out_dim`` features.
    """
    def __init__(self, embedding_dim: int, conditioning_embedding_dim: int, elementwise_affine: bool = False, eps: float = 1e-6, out_dim: Optional[int] = None, dtype=None, device=None, operations=None):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear_1 = operations.Linear(conditioning_embedding_dim, embedding_dim, dtype=dtype, device=device)
        self.norm = operations.LayerNorm(embedding_dim, eps, elementwise_affine, dtype=dtype, device=device)
        if out_dim is None:
            self.linear_2 = None
        else:
            self.linear_2 = operations.Linear(embedding_dim, out_dim, bias=True, dtype=dtype, device=device)
    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
        """Normalise ``x``, scale by ``1 + linear_1(silu(cond))`` and optionally project."""
        activated = self.silu(conditioning_embedding).to(x.dtype)
        scale = self.linear_1(activated)
        modulated = self.norm(x) * (1 + scale)[:, None, :]
        if self.linear_2 is None:
            return modulated
        return self.linear_2(modulated)
 class LuminaFeedForward(nn.Module):
    """Gated feed-forward block: ``linear_2(swiglu(linear_1(x), linear_3(x)))``.

    ``inner_dim`` is rounded up to the next multiple of ``multiple_of`` before
    the bias-free linear layers are created.
    """
    def __init__(self, dim: int, inner_dim: int, multiple_of: int = 256, dtype=None, device=None, operations=None):
        super().__init__()
        # Ceiling division, then scale back up to a whole multiple.
        rounded_dim = -(-inner_dim // multiple_of) * multiple_of
        factory_kwargs = dict(bias=False, dtype=dtype, device=device)
        self.linear_1 = operations.Linear(dim, rounded_dim, **factory_kwargs)
        self.linear_2 = operations.Linear(rounded_dim, dim, **factory_kwargs)
        self.linear_3 = operations.Linear(dim, rounded_dim, **factory_kwargs)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated projection to ``x`` and map it back to ``dim`` features."""
        gate_branch = self.linear_1(x)
        value_branch = self.linear_3(x)
        return self.linear_2(swiglu(gate_branch, value_branch))
 class Lumina2CombinedTimestepCaptionEmbedding(nn.Module):
    """Produce the timestep embedding and the projected caption features.

    The timestep goes through ``Timesteps`` followed by ``TimestepEmbedding``
    (output width ``min(hidden_size, 1024)``); caption features go through an
    RMSNorm followed by a linear projection to ``hidden_size``.
    """
    def __init__(self, hidden_size: int = 4096, text_feat_dim: int = 2048, frequency_embedding_size: int = 256, norm_eps: float = 1e-5, timestep_scale: float = 1.0, dtype=None, device=None, operations=None):
        super().__init__()
        self.time_proj = Timesteps(
            num_channels=frequency_embedding_size,
            flip_sin_to_cos=True,
            downscale_freq_shift=0.0,
            scale=timestep_scale,
        )
        self.timestep_embedder = TimestepEmbedding(
            in_channels=frequency_embedding_size,
            time_embed_dim=min(hidden_size, 1024),
            dtype=dtype,
            device=device,
            operations=operations,
        )
        caption_norm = operations.RMSNorm(text_feat_dim, eps=norm_eps, dtype=dtype, device=device)
        caption_proj = operations.Linear(text_feat_dim, hidden_size, bias=True, dtype=dtype, device=device)
        self.caption_embedder = nn.Sequential(caption_norm, caption_proj)
    def forward(self, timestep: torch.Tensor, text_hidden_states: torch.Tensor, dtype: torch.dtype) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(time_embed, caption_embed)``; the timestep projection is cast to ``dtype``."""
        projected = self.time_proj(timestep).to(dtype=dtype)
        time_embed = self.timestep_embedder(projected)
        return time_embed, self.caption_embedder(text_hidden_states)
 class Attention(nn.Module):
    def __init__(self, query_dim: int, dim_head: int, heads: int, kv_heads: int, eps: float = 1e-5, bias: bool = False, dtype=None, device=None, operations=None):
        super().__init__()
        self.heads = heads
        self.kv_heads = kv_heads
        self.dim_head = dim_head
        self.scale = dim_head ** -0.5
        self.to_q = operations.Linear(query_dim, heads * dim_head, bias=bias, dtype=dtype, device=device)
        self.to_k = operations.Linear(query_dim, kv_heads * dim_head, bias=bias, dtype=dtype, device=device)
        self.to_v = operations.Linear(query_dim, kv_heads * dim_head, bias=bias, dtype=dtype, device=device)
        self.norm_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
        self.norm_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
        self.to_out = nn.Sequential(
            operations.Linear(heads * dim_head, query_dim, bias=bias, dtype=dtype, device=device),
            nn.Dropout(0.0)
        )
    def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, image_rotary_emb: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, sequence_length, _ = hidden_states.shape
        query = self.to_q(hidden_states)
        key = self.to_k(encoder_hidden_states)
        value = self.to_v(encoder_hidden_states)
        query = query.view(batch_size, -1, self.heads, self.dim_head)
        key = key.view(batch_size, -1, self.kv_heads, self.dim_head)
        value = value.view(batch_size, -1, self.kv_heads, self.dim_head)
        query = self.norm_q(query)
        key = self.norm_k(key)
        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)
        if self.kv_heads < self.heads:
            key = key.repeat_interleave(self.heads // self.kv_heads, dim=1)
            value = value.repeat_interleave(self.heads // self.kv_heads, dim=1)
        hidden_states = optimized_attention_masked(query, key, value, self.heads, attention_mask, skip_reshape=True)
        hidden_states = self.to_out[0](hidden_states)
        return hidden_states
 class OmniGen2TransformerBlock(nn.Module):
    """Transformer block: self-attention followed by a gated feed-forward layer.

    With ``modulation=True`` the first norm is a ``LuminaRMSNormZero`` driven by
    ``temb`` and both residual branches are gated with ``tanh``; otherwise a
    plain RMSNorm is used and the residuals are added ungated.
    ``ffn_dim_multiplier`` is accepted but not used by this block.
    """
    def __init__(self, dim: int, num_attention_heads: int, num_kv_heads: int, multiple_of: int, ffn_dim_multiplier: float, norm_eps: float, modulation: bool = True, dtype=None, device=None, operations=None):
        super().__init__()
        self.modulation = modulation
        factory_kwargs = dict(dtype=dtype, device=device)
        self.attn = Attention(
            query_dim=dim,
            dim_head=dim // num_attention_heads,
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            operations=operations,
            **factory_kwargs,
        )
        self.feed_forward = LuminaFeedForward(
            dim=dim,
            inner_dim=4 * dim,
            multiple_of=multiple_of,
            operations=operations,
            **factory_kwargs,
        )
        if modulation:
            self.norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, operations=operations, **factory_kwargs)
        else:
            self.norm1 = operations.RMSNorm(dim, eps=norm_eps, **factory_kwargs)
        self.ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, **factory_kwargs)
        self.norm2 = operations.RMSNorm(dim, eps=norm_eps, **factory_kwargs)
        self.ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, **factory_kwargs)
    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, image_rotary_emb: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run attention and feed-forward with residual connections; ``temb`` is only read when modulated."""
        if not self.modulation:
            normed = self.norm1(hidden_states)
            attn_out = self.attn(normed, normed, attention_mask, image_rotary_emb)
            hidden_states = hidden_states + self.norm2(attn_out)
            ffn_out = self.feed_forward(self.ffn_norm1(hidden_states))
            return hidden_states + self.ffn_norm2(ffn_out)
        normed, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
        attn_out = self.attn(normed, normed, attention_mask, image_rotary_emb)
        hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_out)
        ffn_in = self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1))
        ffn_out = self.feed_forward(ffn_in)
        return hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(ffn_out)
 class OmniGen2RotaryPosEmbed(nn.Module):
    """Build 3-axis position ids for caption / reference-image / image tokens
    and turn them into rotary tables through ``EmbedND``.

    ``axes_lens`` is stored on the instance but is not read by the code in this
    class.
    """
    def __init__(self, theta: int, axes_dim: Tuple[int, int, int], axes_lens: Tuple[int, int, int] = (300, 512, 512), patch_size: int = 2):
        super().__init__()
        self.theta = theta
        self.axes_dim = axes_dim
        self.axes_lens = axes_lens
        self.patch_size = patch_size
        self.rope_embedder = EmbedND(dim=sum(axes_dim), theta=self.theta, axes_dim=axes_dim)
    def forward(self, batch_size, encoder_seq_len, l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len, ref_img_sizes, img_sizes, device):
        """Compute rotary tables for the joint sequence and for each segment.

        Per-sample token order is: caption, then reference images, then image.

        Args:
            batch_size: Number of samples.
            encoder_seq_len: Length of axis 1 of the returned caption table.
            l_effective_cap_len: Per-sample caption token counts.
            l_effective_ref_img_len: Per-sample lists of reference-image token
                counts (summed per sample here).
            l_effective_img_len: Per-sample image token counts.
            ref_img_sizes: Per-sample list of ``(H, W)`` reference sizes, or
                ``None`` for a sample without references.
            img_sizes: Per-sample ``(H, W)`` of the image.
            device: Device on which the position ids and tables are created.

        Returns:
            ``(cap_freqs_cis, ref_img_freqs_cis, img_freqs_cis, freqs_cis,
            l_effective_cap_len, seq_lengths)``.
        """
        p = self.patch_size
        # Total tokens per sample: caption + all reference images + image.
        seq_lengths = [cap_len + sum(ref_img_len) + img_len for cap_len, ref_img_len, img_len in zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len)]
        max_seq_len = max(seq_lengths)
        max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
        max_img_len = max(l_effective_img_len)
        # One (axis0, row, col) triple per token; positions past a sample's length stay zero.
        position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)
        for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
            # Caption tokens: the running index is copied to all three axes.
            position_ids[i, :cap_seq_len] = repeat(torch.arange(cap_seq_len, dtype=torch.int32, device=device), "l -> l 3")
            # pe_shift: value written to axis 0 for the next segment.
            # pe_shift_len: token offset of the next segment inside the sequence.
            pe_shift = cap_seq_len
            pe_shift_len = cap_seq_len
            if ref_img_sizes[i] is not None:
                for ref_img_size, ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
                    H, W = ref_img_size
                    ref_H_tokens, ref_W_tokens = H // p, W // p
                    # Row-major grid coordinates, flattened to one entry per patch.
                    row_ids = repeat(torch.arange(ref_H_tokens, dtype=torch.int32, device=device), "h -> h w", w=ref_W_tokens).flatten()
                    col_ids = repeat(torch.arange(ref_W_tokens, dtype=torch.int32, device=device), "w -> h w", h=ref_H_tokens).flatten()
                    position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 0] = pe_shift
                    position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 1] = row_ids
                    position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 2] = col_ids
                    # Advance axis 0 by the larger grid side of this reference image.
                    pe_shift += max(ref_H_tokens, ref_W_tokens)
                    pe_shift_len += ref_img_len
            # Image tokens fill the remainder of the sample's sequence.
            H, W = img_sizes[i]
            H_tokens, W_tokens = H // p, W // p
            row_ids = repeat(torch.arange(H_tokens, dtype=torch.int32, device=device), "h -> h w", w=W_tokens).flatten()
            col_ids = repeat(torch.arange(W_tokens, dtype=torch.int32, device=device), "w -> h w", h=H_tokens).flatten()
            position_ids[i, pe_shift_len: seq_len, 0] = pe_shift
            position_ids[i, pe_shift_len: seq_len, 1] = row_ids
            position_ids[i, pe_shift_len: seq_len, 2] = col_ids
        # NOTE(review): movedim(1, 2) presumably puts the token axis at index 1
        # (the slicing below indexes tokens on axis 1) - confirm against EmbedND.
        freqs_cis = self.rope_embedder(position_ids).movedim(1, 2)
        # Zero-filled per-segment tables; only axis 1 differs from freqs_cis.
        cap_freqs_cis_shape = list(freqs_cis.shape)
        cap_freqs_cis_shape[1] = encoder_seq_len
        cap_freqs_cis = torch.zeros(*cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
        ref_img_freqs_cis_shape = list(freqs_cis.shape)
        ref_img_freqs_cis_shape[1] = max_ref_img_len
        ref_img_freqs_cis = torch.zeros(*ref_img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
        img_freqs_cis_shape = list(freqs_cis.shape)
        img_freqs_cis_shape[1] = max_img_len
        img_freqs_cis = torch.zeros(*img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
        # Copy each sample's caption / reference / image slice into its table.
        for i, (cap_seq_len, ref_img_len, img_len, seq_len) in enumerate(zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len, seq_lengths)):
            cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
            ref_img_freqs_cis[i, :sum(ref_img_len)] = freqs_cis[i, cap_seq_len:cap_seq_len + sum(ref_img_len)]
            img_freqs_cis[i, :img_len] = freqs_cis[i, cap_seq_len + sum(ref_img_len):cap_seq_len + sum(ref_img_len) + img_len]
        return cap_freqs_cis, ref_img_freqs_cis, img_freqs_cis, freqs_cis, l_effective_cap_len, seq_lengths
 class OmniGen2Transformer2DModel(nn.Module):
    """OmniGen2 transformer operating on patchified latents.

    Caption tokens, optional reference-image tokens and image tokens are
    refined separately, concatenated into one sequence, run through the main
    layer stack, and the image tokens are un-patchified into the output.
    """
    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 16,
        out_channels: Optional[int] = None,
        hidden_size: int = 2304,
        num_layers: int = 26,
        num_refiner_layers: int = 2,
        num_attention_heads: int = 24,
        num_kv_heads: int = 8,
        multiple_of: int = 256,
        ffn_dim_multiplier: Optional[float] = None,
        norm_eps: float = 1e-5,
        axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
        axes_lens: Tuple[int, int, int] = (300, 512, 512),
        text_feat_dim: int = 1024,
        timestep_scale: float = 1.0,
        image_model=None,
        device=None,
        dtype=None,
        operations=None,
    ):
        # ``image_model`` is accepted but not read anywhere in this constructor.
        super().__init__()
        self.patch_size = patch_size
        # Falls back to in_channels when out_channels is None (or otherwise falsy).
        self.out_channels = out_channels or in_channels
        self.hidden_size = hidden_size
        self.dtype = dtype
        self.rope_embedder = OmniGen2RotaryPosEmbed(
            theta=10000,
            axes_dim=axes_dim_rope,
            axes_lens=axes_lens,
            patch_size=patch_size,
        )
        # Separate patch embedders for the image and for reference images.
        self.x_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
        self.ref_image_patch_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
        self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
            hidden_size=hidden_size,
            text_feat_dim=text_feat_dim,
            norm_eps=norm_eps,
            timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
        )
        # Refiner stacks: image and reference-image refiners are modulated,
        # the caption (context) refiner is not.
        self.noise_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(
                hidden_size, num_attention_heads, num_kv_heads,
                multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations
            ) for _ in range(num_refiner_layers)
        ])
        self.ref_image_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(
                hidden_size, num_attention_heads, num_kv_heads,
                multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations
            ) for _ in range(num_refiner_layers)
        ])
        self.context_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(
                hidden_size, num_attention_heads, num_kv_heads,
                multiple_of, ffn_dim_multiplier, norm_eps, modulation=False, dtype=dtype, device=device, operations=operations
            ) for _ in range(num_refiner_layers)
        ])
        # Main stack applied to the joint caption + reference + image sequence.
        self.layers = nn.ModuleList([
            OmniGen2TransformerBlock(
                hidden_size, num_attention_heads, num_kv_heads,
                multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations
            ) for _ in range(num_layers)
        ])
        self.norm_out = LuminaLayerNormContinuous(
            embedding_dim=hidden_size,
            conditioning_embedding_dim=min(hidden_size, 1024),
            elementwise_affine=False,
            eps=1e-6,
            out_dim=patch_size * patch_size * self.out_channels, dtype=dtype, device=device, operations=operations
        )
        # Created with torch.empty, i.e. uninitialised here; presumably filled
        # when weights are loaded - TODO confirm. It has 5 rows and is indexed
        # by reference-image position in img_patch_embed_and_refine.
        self.image_index_embedding = nn.Parameter(torch.empty(5, hidden_size, device=device, dtype=dtype))
    def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
        """Patchify the image (and reference images) into token sequences.

        Returns an 8-tuple ``(flat_hidden_states, flat_ref_img_hidden_states,
        None, None, l_effective_ref_img_len, l_effective_img_len,
        ref_img_sizes, img_sizes)``; the two ``None`` entries occupy the
        positions the caller unpacks as the image and reference-image masks.
        """
        batch_size = len(hidden_states)
        p = self.patch_size
        # Iterating the 4-D batch yields 3-D items, so size(1)/size(2) are the
        # last two dimensions.
        img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
        l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]
        if ref_image_hidden_states is not None:
            # Pad every reference image to a multiple of the patch size.
            ref_image_hidden_states = list(map(lambda ref: comfy.ldm.common_dit.pad_to_patch_size(ref, (p, p)), ref_image_hidden_states))
            # The same inner list object is repeated for every batch entry.
            ref_img_sizes = [[(imgs.size(2), imgs.size(3)) if imgs is not None else None for imgs in ref_image_hidden_states]] * batch_size
            l_effective_ref_img_len = [[(ref_img_size[0] // p) * (ref_img_size[1] // p) for ref_img_size in _ref_img_sizes] if _ref_img_sizes is not None else [0] for _ref_img_sizes in ref_img_sizes]
        else:
            ref_img_sizes = [None for _ in range(batch_size)]
            l_effective_ref_img_len = [[0] for _ in range(batch_size)]
        flat_ref_img_hidden_states = None
        if ref_image_hidden_states is not None:
            imgs = []
            for ref_img in ref_image_hidden_states:
                B, C, H, W = ref_img.size()
                ref_img = rearrange(ref_img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p)
                imgs.append(ref_img)
            # All reference images are concatenated along the token axis.
            flat_ref_img_hidden_states = torch.cat(imgs, dim=1)
        img = hidden_states
        B, C, H, W = img.size()
        flat_hidden_states = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p)
        return (
            flat_hidden_states, flat_ref_img_hidden_states,
            None, None,
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes,
        )
    def img_patch_embed_and_refine(self, hidden_states, ref_image_hidden_states, padded_img_mask, padded_ref_img_mask, noise_rotary_emb, ref_img_rotary_emb, l_effective_ref_img_len, l_effective_img_len, temb):
        """Embed image / reference patches, refine each stream, and concatenate.

        Returns the refined image tokens, preceded along axis 1 by the refined
        reference-image tokens when references are given.
        ``l_effective_img_len`` is accepted but not read in this method.
        """
        batch_size = len(hidden_states)
        hidden_states = self.x_embedder(hidden_states)
        if ref_image_hidden_states is not None:
            ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)
            image_index_embedding = comfy.model_management.cast_to(self.image_index_embedding, dtype=hidden_states.dtype, device=hidden_states.device)
            for i in range(batch_size):
                shift = 0
                # Add the j-th index embedding to the token span of the j-th
                # reference image (written in place into the embedded tensor).
                for j, ref_img_len in enumerate(l_effective_ref_img_len[i]):
                    ref_image_hidden_states[i, shift:shift + ref_img_len, :] = ref_image_hidden_states[i, shift:shift + ref_img_len, :] + image_index_embedding[j]
                    shift += ref_img_len
        for layer in self.noise_refiner:
            hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb)
        if ref_image_hidden_states is not None:
            for layer in self.ref_image_refiner:
                ref_image_hidden_states = layer(ref_image_hidden_states, padded_ref_img_mask, ref_img_rotary_emb, temb)
            # Reference tokens go first, image tokens last.
            hidden_states = torch.cat([ref_image_hidden_states, hidden_states], dim=1)
        return hidden_states
    def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, **kwargs):
        """Run the model on ``x`` and return a tensor cropped to ``x``'s H and W.

        ``x`` is unpacked as ``(B, C, H, W)``. ``context`` is passed through the
        caption embedder, ``num_tokens`` is used as the caption length of every
        sample when building position ids, ``ref_latents`` is an optional
        iterable of reference tensors, and ``attention_mask`` is applied only in
        the caption refiner. The returned value is the negated network output.
        """
        B, C, H, W = x.shape
        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
        _, _, H_padded, W_padded = hidden_states.shape
        # The embedder is fed 1 - timesteps rather than timesteps.
        timestep = 1.0 - timesteps
        text_hidden_states = context
        text_attention_mask = attention_mask
        ref_image_hidden_states = ref_latents
        device = hidden_states.device
        temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
        (
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes,
        ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
        (
            context_rotary_emb, ref_img_rotary_emb, noise_rotary_emb,
            rotary_emb, encoder_seq_lengths, seq_lengths,
        ) = self.rope_embedder(
            hidden_states.shape[0], text_hidden_states.shape[1], [num_tokens] * text_hidden_states.shape[0],
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes, device,
        )
        # The context refiner blocks are unmodulated, so no temb is passed.
        for layer in self.context_refiner:
            text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)
        # Number of image tokens, recorded before reference tokens are prepended.
        img_len = hidden_states.shape[1]
        combined_img_hidden_states = self.img_patch_embed_and_refine(
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            noise_rotary_emb, ref_img_rotary_emb,
            l_effective_ref_img_len, l_effective_img_len,
            temb,
        )
        # Joint sequence order: caption, reference images, image.
        hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1)
        # The main stack is run without an attention mask.
        attention_mask = None
        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
        hidden_states = self.norm_out(hidden_states, temb)
        p = self.patch_size
        # Keep the trailing img_len tokens (the image), un-patchify to the
        # padded size, then crop back to the original H and W.
        output = rearrange(hidden_states[:, -img_len:], 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)',  h=H_padded // p, w=W_padded// p, p1=p, p2=p)[:, :, :H, :W]
        return -output
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@ -146,6 +146,15 @@ WAN_CROSSATTENTION_CLASSES = {
 }
 def repeat_e(e, x):
    """Expand ``e`` along dim 1 so its length matches ``x.shape[1]``.

    ``e`` is returned unchanged when its dim-1 length is at most 1 or when the
    integer ratio ``x.shape[1] // e.shape[1]`` equals 1; otherwise every entry
    along dim 1 is repeated that many times consecutively.
    """
    if e.shape[1] <= 1:
        return e
    factor = x.shape[1] // e.shape[1]
    if factor == 1:
        return e
    return torch.repeat_interleave(e, factor, dim=1)
 class WanAttentionBlock(nn.Module):
    def __init__(self,
@ -202,20 +211,23 @@ class WanAttentionBlock(nn.Module):
        """
        # assert e.dtype == torch.float32
        if e.ndim < 4:
            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
        else:
            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2)
        # assert e[0].dtype == torch.float32
        # self-attention
        y = self.self_attn(
-            self.norm1(x) * (1 + e[1]) + e[0],
+            self.norm1(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x),
            freqs)
-        x = x + y * e[2]
+        x = x + y * repeat_e(e[2], x)
        # cross-attention & ffn
        x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len)
-        y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3])
+        y = self.ffn(self.norm2(x) * (1 + repeat_e(e[4], x)) + repeat_e(e[3], x))
-        x = x + y * e[5]
+        x = x + y * repeat_e(e[5], x)
        return x
@ -325,8 +337,12 @@ class Head(nn.Module):
            e(Tensor): Shape [B, C]
        """
        # assert e.dtype == torch.float32
        if e.ndim < 3:
            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1)
-        x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
+        else:
            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2)
        x = (self.head(self.norm(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x)))
        return x
@ -506,8 +522,9 @@ class WanModel(torch.nn.Module):
        # time embeddings
        e = self.time_embedding(
-            sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
+            sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype))
-        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
+        e = e.reshape(t.shape[0], -1, e.shape[-1])
        e0 = self.time_projection(e).unflatten(2, (6, self.dim))
        # context
        context = self.text_embedding(context)
@ -752,8 +769,7 @@ class CameraWanModel(WanModel):
        # embeddings
        x = self.patch_embedding(x.float()).to(x.dtype)
        if self.control_adapter is not None and camera_conditions is not None:
-            x_camera = self.control_adapter(camera_conditions).to(x.dtype)
+            x = x + self.control_adapter(camera_conditions).to(x.dtype)
            x = x + x_camera
        grid_sizes = x.shape[2:]
        x = x.flatten(2).transpose(1, 2)
--- a/comfy/ldm/wan/vae.py
+++ b/comfy/ldm/wan/vae.py
@ -24,12 +24,17 @@ class CausalConv3d(ops.Conv3d):
                         self.padding[1], 2 * self.padding[0], 0)
        self.padding = (0, 0, 0)
-    def forward(self, x, cache_x=None):
+    def forward(self, x, cache_x=None, cache_list=None, cache_idx=None):
        if cache_list is not None:
            cache_x = cache_list[cache_idx]
            cache_list[cache_idx] = None
        padding = list(self._padding)
        if cache_x is not None and self._padding[4] > 0:
            cache_x = cache_x.to(x.device)
            x = torch.cat([cache_x, x], dim=2)
            padding[4] -= cache_x.shape[2]
            del cache_x
        x = F.pad(x, padding)
        return super().forward(x)
@ -52,15 +57,6 @@ class RMS_norm(nn.Module):
            x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0)
 class Upsample(nn.Upsample):
    def forward(self, x):
        """
        Fix bfloat16 support for nearest neighbor interpolation.
        """
        return super().forward(x.float()).type_as(x)
 class Resample(nn.Module):
    def __init__(self, dim, mode):
@ -73,11 +69,11 @@ class Resample(nn.Module):
        # layers
        if mode == 'upsample2d':
            self.resample = nn.Sequential(
-                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+                nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                ops.Conv2d(dim, dim // 2, 3, padding=1))
        elif mode == 'upsample3d':
            self.resample = nn.Sequential(
-                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+                nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                ops.Conv2d(dim, dim // 2, 3, padding=1))
            self.time_conv = CausalConv3d(
                dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
@ -157,29 +153,6 @@ class Resample(nn.Module):
                    feat_idx[0] += 1
        return x
    def init_weight(self, conv):
        conv_weight = conv.weight
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        one_matrix = torch.eye(c1, c2)
        init_matrix = one_matrix
        nn.init.zeros_(conv_weight)
        #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
        conv_weight.data[:, :, 1, 0, 0] = init_matrix  #* 0.5
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)
    def init_weight2(self, conv):
        conv_weight = conv.weight.data
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        init_matrix = torch.eye(c1 // 2, c2)
        #init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
        conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
        conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)
 class ResidualBlock(nn.Module):
@ -198,7 +171,7 @@ class ResidualBlock(nn.Module):
            if in_dim != out_dim else nn.Identity()
    def forward(self, x, feat_cache=None, feat_idx=[0]):
-        h = self.shortcut(x)
+        old_x = x
        for layer in self.residual:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
@ -210,12 +183,12 @@ class ResidualBlock(nn.Module):
                            cache_x.device), cache_x
                    ],
                                        dim=2)
-                x = layer(x, feat_cache[idx])
+                x = layer(x, cache_list=feat_cache, cache_idx=idx)
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
-        return x + h
+        return x + self.shortcut(old_x)
 class AttentionBlock(nn.Module):
@ -494,12 +467,6 @@ class WanVAE(nn.Module):
        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_upsample, dropout)
    def forward(self, x):
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        x_recon = self.decode(z)
        return x_recon, mu, log_var
    def encode(self, x):
        self.clear_cache()
        ## cache
@ -545,18 +512,6 @@ class WanVAE(nn.Module):
        self.clear_cache()
        return out
    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps * std + mu
    def sample(self, imgs, deterministic=False):
        mu, log_var = self.encode(imgs)
        if deterministic:
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)
    def clear_cache(self):
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
--- a/comfy/ldm/wan/vae2_2.py
+++ b/comfy/ldm/wan/vae2_2.py
@ -0,0 +1,726 @@
 # original version: https://github.com/Wan-Video/Wan2.2/blob/main/wan/modules/vae2_2.py
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
 from .vae import AttentionBlock, CausalConv3d, RMS_norm
 import comfy.ops
 ops = comfy.ops.disable_weight_init
 CACHE_T = 2
 class Resample(nn.Module):
    """Per-frame spatial resampling with an optional causal temporal conv.

    ``mode`` selects the layers that are built:
      * ``"none"``: ``nn.Identity``.
      * ``"upsample2d"``: nearest-exact 2x upsampling followed by a 3x3
        ``Conv2d`` that keeps ``dim`` channels.
      * ``"upsample3d"``: the same spatial path plus ``time_conv``, a causal
        temporal convolution producing ``2 * dim`` channels that ``forward``
        folds back into twice as many frames.
      * ``"downsample2d"``: zero-pad one pixel on the right/bottom, then a
        3x3 stride-2 ``Conv2d``.
      * ``"downsample3d"``: the same spatial path plus a stride-2 causal
        temporal convolution.

    The temporal convolutions only run when ``forward`` receives a
    ``feat_cache``; without one only the spatial path is applied.
    """
    def __init__(self, dim, mode):
        assert mode in (
            "none",
            "upsample2d",
            "upsample3d",
            "downsample2d",
            "downsample3d",
        )
        super().__init__()
        self.dim = dim
        self.mode = mode
        # layers
        if mode == "upsample2d":
            self.resample = nn.Sequential(
                nn.Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
                ops.Conv2d(dim, dim, 3, padding=1),
            )
        elif mode == "upsample3d":
            self.resample = nn.Sequential(
                nn.Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
                ops.Conv2d(dim, dim, 3, padding=1),
                # ops.Conv2d(dim, dim//2, 3, padding=1)
            )
            # doubles the channel count; forward() turns the two channel
            # groups into two consecutive frames
            self.time_conv = CausalConv3d(
                dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
        elif mode == "downsample2d":
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                ops.Conv2d(dim, dim, 3, stride=(2, 2)))
        elif mode == "downsample3d":
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                ops.Conv2d(dim, dim, 3, stride=(2, 2)))
            # no built-in padding: forward() prepends the cached frame itself
            self.time_conv = CausalConv3d(
                dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
        else:
            self.resample = nn.Identity()
    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Resample ``x`` (unpacked as ``b, c, t, h, w``).

        Args:
            x: 5-D input tensor.
            feat_cache: optional list of per-convolution cache slots. A slot
                holds ``None`` (never used), the marker string ``"Rep"``
                (upsample3d only, set on the first chunk) or a tensor of
                trailing frames from the previous chunk.
            feat_idx: single-element list used as a mutable slot counter; it
                is advanced by one when this layer consumes a slot. The
                default list is shared between calls but is only touched
                when ``feat_cache`` is given.

        Returns:
            The resampled tensor.
        """
        b, c, t, h, w = x.size()
        if self.mode == "upsample3d":
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # first chunk: only mark the slot, the temporal conv
                    # (and therefore the frame doubling) is skipped
                    feat_cache[idx] = "Rep"
                    feat_idx[0] += 1
                else:
                    # trailing frames of this chunk become the next cache
                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
                    if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
                            feat_cache[idx] != "Rep"):
                        # chunk shorter than two frames: prepend the last
                        # frame of the previously cached chunk
                        cache_x = torch.cat(
                            [
                                feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                                    cache_x.device),
                                cache_x,
                            ],
                            dim=2,
                        )
                    if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
                            feat_cache[idx] == "Rep"):
                        # no real previous frames yet: prepend zeros instead
                        cache_x = torch.cat(
                            [
                                torch.zeros_like(cache_x).to(cache_x.device),
                                cache_x
                            ],
                            dim=2,
                        )
                    if feat_cache[idx] == "Rep":
                        # slot only holds the marker, so there is nothing
                        # to hand to the convolution as cached frames
                        x = self.time_conv(x)
                    else:
                        x = self.time_conv(x, feat_cache[idx])
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
                    # split the 2*c channels into two groups and interleave
                    # them along time, giving t * 2 frames of c channels
                    x = x.reshape(b, 2, c, t, h, w)
                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
                                    3)
                    x = x.reshape(b, c, t * 2, h, w)
        # fold time into the batch axis so the 2-D layers run per frame
        t = x.shape[2]
        x = rearrange(x, "b c t h w -> (b t) c h w")
        x = self.resample(x)
        x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
        if self.mode == "downsample3d":
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # first chunk: remember it, no temporal downsampling
                    feat_cache[idx] = x.clone()
                    feat_idx[0] += 1
                else:
                    # keep this chunk's last frame for the next call and
                    # convolve over [last cached frame, current chunk]
                    cache_x = x[:, :, -1:, :, :].clone()
                    x = self.time_conv(
                        torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
        return x
 class ResidualBlock(nn.Module):
    """Residual unit: (RMS norm -> SiLU -> causal 3x3x3 conv) twice, plus skip.

    Dropout is applied before the second convolution. The skip path is a
    1x1x1 ``CausalConv3d`` when ``in_dim != out_dim`` and an identity
    otherwise.
    """
    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        # main path
        main_path = [
            RMS_norm(in_dim, images=False),
            nn.SiLU(),
            CausalConv3d(in_dim, out_dim, 3, padding=1),
            RMS_norm(out_dim, images=False),
            nn.SiLU(),
            nn.Dropout(dropout),
            CausalConv3d(out_dim, out_dim, 3, padding=1),
        ]
        self.residual = nn.Sequential(*main_path)
        # skip path
        if in_dim != out_dim:
            self.shortcut = CausalConv3d(in_dim, out_dim, 1)
        else:
            self.shortcut = nn.Identity()
    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Run the block, optionally using/updating per-convolution caches.

        ``feat_cache`` is a list of cache slots and ``feat_idx`` a
        single-element list holding the index of the next slot; each causal
        convolution on the main path consumes one slot and advances the
        counter. Without ``feat_cache`` every layer is simply called on
        ``x``.
        """
        skip_input = x
        caching = feat_cache is not None
        for layer in self.residual:
            if not (caching and isinstance(layer, CausalConv3d)):
                x = layer(x)
                continue
            slot = feat_idx[0]
            # trailing frames of this chunk, stored for the next chunk
            tail = x[:, :, -CACHE_T:, :, :].clone()
            if tail.shape[2] < 2 and feat_cache[slot] is not None:
                # short chunk: prepend the last frame cached previously
                prev_last = feat_cache[slot][:, :, -1, :, :].unsqueeze(2)
                tail = torch.cat([prev_last.to(tail.device), tail], dim=2)
                del prev_last
            # the convolution takes the cached frames out of the list itself
            x = layer(x, cache_list=feat_cache, cache_idx=slot)
            feat_cache[slot] = tail
            feat_idx[0] += 1
        return x + self.shortcut(skip_input)
 def patchify(x, patch_size):
    """Fold ``patch_size`` x ``patch_size`` spatial patches into channels.

    A 4-D input is treated as ``b c h w`` and a 5-D input as ``b c f h w``;
    the channel count grows by ``patch_size ** 2`` while height and width
    shrink by ``patch_size``. ``patch_size == 1`` returns ``x`` unchanged.

    Raises:
        ValueError: if ``x`` is neither 4-D nor 5-D.
    """
    if patch_size == 1:
        return x
    rank = x.dim()
    if rank == 4:
        pattern = "b c (h q) (w r) -> b (c r q) h w"
    elif rank == 5:
        pattern = "b c f (h q) (w r) -> b (c r q) f h w"
    else:
        raise ValueError(f"Invalid input shape: {x.shape}")
    return rearrange(x, pattern, q=patch_size, r=patch_size)
 def unpatchify(x, patch_size):
    """Unfold channels back into ``patch_size`` x ``patch_size`` patches.

    Inverse of ``patchify``: a 4-D input is treated as ``b (c r q) h w`` and
    a 5-D input as ``b (c r q) f h w``; height and width grow by
    ``patch_size`` while the channel count shrinks by ``patch_size ** 2``.
    ``patch_size == 1`` returns ``x`` unchanged.

    Raises:
        ValueError: if ``x`` is neither 4-D nor 5-D. Previously such input
            was silently returned without being rearranged, unlike
            ``patchify`` which rejects it.
    """
    if patch_size == 1:
        return x
    if x.dim() == 4:
        x = rearrange(
            x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
    elif x.dim() == 5:
        x = rearrange(
            x,
            "b (c r q) f h w -> b c f (h q) (w r)",
            q=patch_size,
            r=patch_size,
        )
    else:
        raise ValueError(f"Invalid input shape: {x.shape}")
    return x
 class AvgDown3D(nn.Module):
    """Parameter-free downsampling by space/time-to-channel plus group mean.

    Each ``factor_t x factor_s x factor_s`` block of the input is moved into
    the channel axis, giving ``in_channels * factor`` channels, which are
    then averaged in consecutive groups of ``group_size`` to obtain
    ``out_channels`` channels.
    """
    def __init__(
        self,
        in_channels,
        out_channels,
        factor_t,
        factor_s=1,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = factor_t * factor_s * factor_s
        assert in_channels * self.factor % out_channels == 0
        self.group_size = in_channels * self.factor // out_channels
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Downsample a ``(B, C, T, H, W)`` tensor.

        The time axis is zero-padded at the front so that its length is a
        multiple of ``factor_t``.
        """
        ft, fs = self.factor_t, self.factor_s
        front_pad = (ft - x.shape[2] % ft) % ft
        x = F.pad(x, (0, 0, 0, 0, front_pad, 0))
        batch, channels, frames, height, width = x.shape
        t_out, h_out, w_out = frames // ft, height // fs, width // fs
        # expose the block-local offsets as their own axes ...
        x = x.view(batch, channels, t_out, ft, h_out, fs, w_out, fs)
        # ... and move them next to the channel axis
        x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
        # (channels * factor) is regrouped as (out_channels, group_size)
        x = x.view(
            batch, self.out_channels, self.group_size, t_out, h_out, w_out)
        return x.mean(dim=2)
 class DupUp3D(nn.Module):
    """Parameter-free upsampling by channel duplication + channel-to-space.

    Every input channel is repeated ``repeats`` times so that the channel
    axis holds ``out_channels * factor`` entries, which are then unfolded
    into ``factor_t x factor_s x factor_s`` blocks along time, height and
    width.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        factor_t,
        factor_s=1,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = factor_t * factor_s * factor_s
        assert out_channels * self.factor % in_channels == 0
        self.repeats = out_channels * self.factor // in_channels
    def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
        """Upsample ``x``; with ``first_chunk`` drop the first
        ``factor_t - 1`` output frames."""
        ft, fs = self.factor_t, self.factor_s
        batch = x.size(0)
        frames, height, width = x.size(2), x.size(3), x.size(4)
        x = x.repeat_interleave(self.repeats, dim=1)
        # split the channel axis into (out_channels, ft, fs, fs)
        x = x.view(batch, self.out_channels, ft, fs, fs, frames, height, width)
        # pair every block offset with the axis it expands
        x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
        x = x.view(
            batch, self.out_channels, frames * ft, height * fs, width * fs)
        if not first_chunk:
            return x
        return x[:, :, ft - 1:, :, :]
 class Down_ResidualBlock(nn.Module):
    """Encoder stage: ``mult`` residual blocks, an optional downsample and
    a parameter-free ``AvgDown3D`` shortcut added to the result."""
    def __init__(self,
                 in_dim,
                 out_dim,
                 dropout,
                 mult,
                 temperal_downsample=False,
                 down_flag=False):
        super().__init__()
        time_factor = 2 if temperal_downsample else 1
        space_factor = 2 if down_flag else 1
        # shortcut path: averages the input down to the output shape
        self.avg_shortcut = AvgDown3D(
            in_dim, out_dim, factor_t=time_factor, factor_s=space_factor)
        # main path: the first residual block changes the channel count,
        # the remaining ones keep it
        stages = [
            ResidualBlock(in_dim if k == 0 else out_dim, out_dim, dropout)
            for k in range(mult)
        ]
        if down_flag:
            resample_mode = (
                "downsample3d" if temperal_downsample else "downsample2d")
            stages.append(Resample(out_dim, mode=resample_mode))
        self.downsamples = nn.Sequential(*stages)
    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Apply the main path to ``x`` and add the averaged shortcut."""
        shortcut_input = x
        for stage in self.downsamples:
            x = stage(x, feat_cache, feat_idx)
        return x + self.avg_shortcut(shortcut_input)
 class Up_ResidualBlock(nn.Module):
    """Decoder stage: ``mult`` residual blocks and an optional upsample.

    When ``up_flag`` is set a parameter-free ``DupUp3D`` shortcut of the
    stage input is added to the main-path output; otherwise there is no
    shortcut at this level.
    """
    def __init__(self,
                 in_dim,
                 out_dim,
                 dropout,
                 mult,
                 temperal_upsample=False,
                 up_flag=False):
        super().__init__()
        # shortcut path (only present when this stage upsamples)
        if not up_flag:
            self.avg_shortcut = None
        else:
            self.avg_shortcut = DupUp3D(
                in_dim,
                out_dim,
                factor_t=2 if temperal_upsample else 1,
                factor_s=2,
            )
        # main path: the first residual block changes the channel count,
        # the remaining ones keep it
        stages = [
            ResidualBlock(in_dim if k == 0 else out_dim, out_dim, dropout)
            for k in range(mult)
        ]
        if up_flag:
            resample_mode = "upsample3d" if temperal_upsample else "upsample2d"
            stages.append(Resample(out_dim, mode=resample_mode))
        self.upsamples = nn.Sequential(*stages)
    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
        """Apply the main path and add the shortcut when one exists.

        ``first_chunk`` is forwarded to the shortcut only.
        """
        main = x
        for stage in self.upsamples:
            main = stage(main, feat_cache, feat_idx)
        if self.avg_shortcut is None:
            return main
        shortcut = self.avg_shortcut(x, first_chunk)
        return main + shortcut
 class Encoder3d(nn.Module):
    """Causal 3D encoder: stem conv, down stages, middle blocks and head.

    Args:
        dim: base channel width; stage ``i`` outputs ``dim * dim_mult[i]``.
        z_dim: number of channels produced by the head convolution.
        dim_mult: per-stage channel multipliers.
        num_res_blocks: residual blocks per down stage.
        attn_scales: stored on the module; not used when building layers.
        temperal_downsample: per-stage flags enabling temporal
            downsampling (stages beyond the list default to ``False``).
        dropout: dropout rate passed to every residual block.
    """
    def __init__(
        self,
        dim=128,
        z_dim=4,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        temperal_downsample=[True, True, False],
        dropout=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        # dimensions
        dims = [dim * u for u in [1] + dim_mult]
        # init block
        # NOTE(review): 12 input channels presumably means 3 image channels
        # after a 2x2 patchify - confirm against the caller.
        self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
        # downsample blocks; every stage but the last one downsamples
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            t_down_flag = (
                temperal_downsample[i]
                if i < len(temperal_downsample) else False)
            downsamples.append(
                Down_ResidualBlock(
                    in_dim=in_dim,
                    out_dim=out_dim,
                    dropout=dropout,
                    mult=num_res_blocks,
                    temperal_downsample=t_down_flag,
                    down_flag=i != len(dim_mult) - 1,
                ))
        self.downsamples = nn.Sequential(*downsamples)
        # width of the last stage feeds the middle blocks and the head
        out_dim = dims[-1]
        # middle blocks
        self.middle = nn.Sequential(
            ResidualBlock(out_dim, out_dim, dropout),
            AttentionBlock(out_dim),
            ResidualBlock(out_dim, out_dim, dropout),
        )
        # output blocks
        self.head = nn.Sequential(
            RMS_norm(out_dim, images=False),
            nn.SiLU(),
            CausalConv3d(out_dim, z_dim, 3, padding=1),
        )
    def _conv_with_cache(self, conv, x, feat_cache, feat_idx):
        """Run ``conv`` on ``x`` with the cached frames of its slot, then
        store the trailing frames of ``x`` in that slot and advance the
        slot counter."""
        idx = feat_idx[0]
        cache_x = x[:, :, -CACHE_T:, :, :].clone()
        if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
            # short chunk: prepend the last frame cached previously
            cache_x = torch.cat(
                [
                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                        cache_x.device),
                    cache_x,
                ],
                dim=2,
            )
        x = conv(x, feat_cache[idx])
        feat_cache[idx] = cache_x
        feat_idx[0] += 1
        return x
    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Encode ``x``.

        Args:
            x: input tensor (sliced as ``b, c, t, h, w``).
            feat_cache: optional list of per-convolution cache slots used
                for chunked processing; ``None`` disables caching.
            feat_idx: single-element list holding the next free slot index;
                only read and updated when ``feat_cache`` is given.

        Returns:
            Tensor with ``z_dim`` channels.
        """
        use_cache = feat_cache is not None
        if use_cache:
            x = self._conv_with_cache(self.conv1, x, feat_cache, feat_idx)
        else:
            x = self.conv1(x)
        ## downsamples
        for layer in self.downsamples:
            if use_cache:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)
        ## middle
        for layer in self.middle:
            if use_cache and isinstance(layer, ResidualBlock):
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)
        ## head
        for layer in self.head:
            if use_cache and isinstance(layer, CausalConv3d):
                x = self._conv_with_cache(layer, x, feat_cache, feat_idx)
            else:
                x = layer(x)
        return x
 class Decoder3d(nn.Module):
    """Causal 3D decoder: stem conv, middle blocks, up stages and head.

    Stage widths are ``dim * m`` for ``m`` in ``dim_mult`` reversed; every
    stage but the last one upsamples. The head outputs 12 channels.
    """
    def __init__(
        self,
        dim=128,
        z_dim=4,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        temperal_upsample=[False, True, True],
        dropout=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample
        # channel widths: widest first
        widths = [dim * m for m in [dim_mult[-1]] + dim_mult[::-1]]
        top = widths[0]
        # stem
        self.conv1 = CausalConv3d(z_dim, top, 3, padding=1)
        # middle
        self.middle = nn.Sequential(
            ResidualBlock(top, top, dropout),
            AttentionBlock(top),
            ResidualBlock(top, top, dropout),
        )
        # up stages
        last_stage = len(dim_mult) - 1
        stages = []
        for i, (w_in, w_out) in enumerate(zip(widths[:-1], widths[1:])):
            if i < len(temperal_upsample):
                time_up = temperal_upsample[i]
            else:
                time_up = False
            stages.append(
                Up_ResidualBlock(
                    in_dim=w_in,
                    out_dim=w_out,
                    dropout=dropout,
                    mult=num_res_blocks + 1,
                    temperal_upsample=time_up,
                    up_flag=i != last_stage,
                ))
        self.upsamples = nn.Sequential(*stages)
        # head
        final = widths[-1]
        self.head = nn.Sequential(
            RMS_norm(final, images=False),
            nn.SiLU(),
            CausalConv3d(final, 12, 3, padding=1),
        )
    def _conv_with_cache(self, conv, x, feat_cache, feat_idx):
        """Run ``conv`` on ``x`` with the cached frames of its slot, then
        store the trailing frames of ``x`` in that slot and advance the
        slot counter."""
        slot = feat_idx[0]
        tail = x[:, :, -CACHE_T:, :, :].clone()
        if tail.shape[2] < 2 and feat_cache[slot] is not None:
            # short chunk: prepend the last frame cached previously
            prev_last = feat_cache[slot][:, :, -1, :, :].unsqueeze(2)
            tail = torch.cat([prev_last.to(tail.device), tail], dim=2)
            del prev_last
        x = conv(x, feat_cache[slot])
        feat_cache[slot] = tail
        feat_idx[0] += 1
        return x
    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
        """Decode ``x``.

        ``feat_cache`` / ``feat_idx`` are the cache slot list and the
        single-element slot counter used for chunked decoding.
        ``first_chunk`` is handed to the up stages only when a cache is
        supplied.
        """
        caching = feat_cache is not None
        if not caching:
            x = self.conv1(x)
        else:
            x = self._conv_with_cache(self.conv1, x, feat_cache, feat_idx)
        # middle
        for layer in self.middle:
            if caching and isinstance(layer, ResidualBlock):
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)
        # up stages
        for layer in self.upsamples:
            if caching:
                x = layer(x, feat_cache, feat_idx, first_chunk)
            else:
                x = layer(x)
        # head
        for layer in self.head:
            if caching and isinstance(layer, CausalConv3d):
                x = self._conv_with_cache(layer, x, feat_cache, feat_idx)
            else:
                x = layer(x)
        return x
 def count_conv3d(model):
    """Return how many ``CausalConv3d`` modules ``model.modules()`` yields."""
    return sum(1 for m in model.modules() if isinstance(m, CausalConv3d))
 class WanVAE(nn.Module):
    """Chunked causal video VAE built from ``Encoder3d`` and ``Decoder3d``.

    Inputs are patchified with ``patch_size=2`` before encoding and
    unpatchified after decoding. Encoding and decoding run chunk by chunk
    along time, carrying convolution state in per-layer feature caches that
    are reset before and after each pass.
    """
    def __init__(
        self,
        dim=160,
        dec_dim=256,
        z_dim=16,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        temperal_downsample=[True, True, False],
        dropout=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        self.temperal_upsample = temperal_downsample[::-1]
        # modules
        # the encoder emits 2 * z_dim channels: mean and log-variance
        self.encoder = Encoder3d(
            dim,
            z_dim * 2,
            dim_mult,
            num_res_blocks,
            attn_scales,
            self.temperal_downsample,
            dropout,
        )
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d(
            dec_dim,
            z_dim,
            dim_mult,
            num_res_blocks,
            attn_scales,
            self.temperal_upsample,
            dropout,
        )
    def _encode_moments(self, x):
        """Encode ``x`` and return ``(mu, log_var)``.

        The first chunk is the first frame alone; every following chunk
        holds four frames. Frames that do not fill a complete chunk at the
        end of the clip are not encoded.
        """
        self.clear_cache()
        x = patchify(x, patch_size=2)
        t = x.shape[2]
        iter_ = 1 + (t - 1) // 4
        for i in range(iter_):
            # every chunk walks the cache slots from the start
            self._enc_conv_idx = [0]
            if i == 0:
                out = self.encoder(
                    x[:, :, :1, :, :],
                    feat_cache=self._enc_feat_map,
                    feat_idx=self._enc_conv_idx,
                )
            else:
                out_ = self.encoder(
                    x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
                    feat_cache=self._enc_feat_map,
                    feat_idx=self._enc_conv_idx,
                )
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        self.clear_cache()
        return mu, log_var
    def encode(self, x):
        """Encode ``x`` and return the posterior mean ``mu`` only."""
        mu, _ = self._encode_moments(x)
        return mu
    def decode(self, z):
        """Decode latent ``z`` one latent frame at a time.

        The first latent frame is decoded with ``first_chunk=True``; the
        concatenated result is unpatchified with ``patch_size=2``.
        """
        self.clear_cache()
        iter_ = z.shape[2]
        x = self.conv2(z)
        for i in range(iter_):
            # every chunk walks the cache slots from the start
            self._conv_idx = [0]
            if i == 0:
                out = self.decoder(
                    x[:, :, i:i + 1, :, :],
                    feat_cache=self._feat_map,
                    feat_idx=self._conv_idx,
                    first_chunk=True,
                )
            else:
                out_ = self.decoder(
                    x[:, :, i:i + 1, :, :],
                    feat_cache=self._feat_map,
                    feat_idx=self._conv_idx,
                )
                out = torch.cat([out, out_], 2)
        out = unpatchify(out, patch_size=2)
        self.clear_cache()
        return out
    def reparameterize(self, mu, log_var):
        """Draw ``mu + eps * exp(0.5 * log_var)`` with ``eps ~ N(0, 1)``."""
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps * std + mu
    def sample(self, imgs, deterministic=False):
        """Encode ``imgs`` and sample a latent from the posterior.

        Returns ``mu`` when ``deterministic`` is true; otherwise
        ``mu + std * noise`` with ``log_var`` clamped to ``[-30, 20]``.
        """
        # encode() only returns mu, so the log-variance has to come from
        # the shared helper; unpacking encode()'s result was a bug.
        mu, log_var = self._encode_moments(imgs)
        if deterministic:
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)
    def clear_cache(self):
        """Reset decoder and encoder caches: one empty slot per
        ``CausalConv3d`` and slot counters back at zero."""
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
        self._feat_map = [None] * self._conv_num
        # cache encode
        self._enc_conv_num = count_conv3d(self.encoder)
        self._enc_conv_idx = [0]
        self._enc_feat_map = [None] * self._enc_conv_num
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -34,12 +34,14 @@ import comfy.ldm.flux.model
 import comfy.ldm.lightricks.model
 import comfy.ldm.hunyuan_video.model
 import comfy.ldm.cosmos.model
 import comfy.ldm.cosmos.predict2
 import comfy.ldm.lumina.model
 import comfy.ldm.wan.model
 import comfy.ldm.hunyuan3d.model
 import comfy.ldm.hidream.model
 import comfy.ldm.chroma.model
 import comfy.ldm.ace.model
 import comfy.ldm.omnigen.omnigen2
 import comfy.model_management
 import comfy.patcher_extension
@ -48,6 +50,7 @@ import comfy.ops
 from enum import Enum
 from . import utils
 import comfy.latent_formats
 import comfy.model_sampling
 import math
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
@ -63,38 +66,39 @@ class ModelType(Enum):
    V_PREDICTION_CONTINUOUS = 7
    FLUX = 8
    IMG_TO_IMG = 9
-
+    FLOW_COSMOS = 10
 from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
 def model_sampling(model_config, model_type):
-    s = ModelSamplingDiscrete
+    s = comfy.model_sampling.ModelSamplingDiscrete
    if model_type == ModelType.EPS:
-        c = EPS
+        c = comfy.model_sampling.EPS
    elif model_type == ModelType.V_PREDICTION:
-        c = V_PREDICTION
+        c = comfy.model_sampling.V_PREDICTION
    elif model_type == ModelType.V_PREDICTION_EDM:
-        c = V_PREDICTION
+        c = comfy.model_sampling.V_PREDICTION
-        s = ModelSamplingContinuousEDM
+        s = comfy.model_sampling.ModelSamplingContinuousEDM
    elif model_type == ModelType.FLOW:
        c = comfy.model_sampling.CONST
        s = comfy.model_sampling.ModelSamplingDiscreteFlow
    elif model_type == ModelType.STABLE_CASCADE:
-        c = EPS
+        c = comfy.model_sampling.EPS
-        s = StableCascadeSampling
+        s = comfy.model_sampling.StableCascadeSampling
    elif model_type == ModelType.EDM:
-        c = EDM
+        c = comfy.model_sampling.EDM
-        s = ModelSamplingContinuousEDM
+        s = comfy.model_sampling.ModelSamplingContinuousEDM
    elif model_type == ModelType.V_PREDICTION_CONTINUOUS:
-        c = V_PREDICTION
+        c = comfy.model_sampling.V_PREDICTION
-        s = ModelSamplingContinuousV
+        s = comfy.model_sampling.ModelSamplingContinuousV
    elif model_type == ModelType.FLUX:
        c = comfy.model_sampling.CONST
        s = comfy.model_sampling.ModelSamplingFlux
    elif model_type == ModelType.IMG_TO_IMG:
        c = comfy.model_sampling.IMG_TO_IMG
    elif model_type == ModelType.FLOW_COSMOS:
        c = comfy.model_sampling.COSMOS_RFLOW
        s = comfy.model_sampling.ModelSamplingCosmosRFlow
    class ModelSampling(s, c):
        pass
@ -102,10 +106,12 @@ def model_sampling(model_config, model_type):
    return ModelSampling(model_config)
-def convert_tensor(extra, dtype):
+def convert_tensor(extra, dtype, device):
    if hasattr(extra, "dtype"):
        if extra.dtype != torch.int and extra.dtype != torch.long:
-            extra = extra.to(dtype)
+            extra = extra.to(dtype=dtype, device=device)
        else:
            extra = extra.to(device=device)
    return extra
@ -165,20 +171,21 @@ class BaseModel(torch.nn.Module):
            dtype = self.manual_cast_dtype
        xc = xc.to(dtype)
        device = xc.device
        t = self.model_sampling.timestep(t).float()
        if context is not None:
-            context = context.to(dtype)
+            context = context.to(dtype=dtype, device=device)
        extra_conds = {}
        for o in kwargs:
            extra = kwargs[o]
            if hasattr(extra, "dtype"):
-                extra = convert_tensor(extra, dtype)
+                extra = convert_tensor(extra, dtype, device)
            elif isinstance(extra, list):
                ex = []
                for ext in extra:
-                    ex.append(convert_tensor(ext, dtype))
+                    ex.append(convert_tensor(ext, dtype, device))
                extra = ex
            extra_conds[o] = extra
@ -812,6 +819,7 @@ class PixArt(BaseModel):
 class Flux(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.flux.model.Flux):
        # Construction is delegated to BaseModel; `unet_model` is forwarded unchanged
        # and defaults to comfy.ldm.flux.model.Flux.
        super().__init__(model_config, model_type, device=device, unet_model=unet_model)
        # 'ref_latents' is the cond key this class fills from `reference_latents`.
        # NOTE(review): presumably consulted when estimating memory usage - confirm against BaseModel.
        self.memory_usage_factor_conds = ("ref_latents",)
    def concat_cond(self, **kwargs):
        try:
@ -872,8 +880,23 @@ class Flux(BaseModel):
        guidance = kwargs.get("guidance", 3.5)
        if guidance is not None:
            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
        ref_latents = kwargs.get("reference_latents", None)
        if ref_latents is not None:
            latents = []
            for lat in ref_latents:
                latents.append(self.process_latent_in(lat))
            out['ref_latents'] = comfy.conds.CONDList(latents)
        return out
    def extra_conds_shapes(self, **kwargs):
        """Report the shape of the conditioning derived from reference latents.

        Reads ``reference_latents`` from ``kwargs``. When it is missing or
        ``None`` an empty dict is returned. Otherwise the dict maps
        ``'ref_latents'`` to ``[1, 16, total // 16]``, where ``total`` is the
        combined element count of every reference latent.
        """
        ref_latents = kwargs.get("reference_latents", None)
        if ref_latents is None:
            return {}
        # All reference latents are folded into one 16-channel entry that keeps
        # the same overall element count.
        total_elements = sum(math.prod(lat.size()) for lat in ref_latents)
        return {'ref_latents': [1, 16, total_elements // 16]}
 class GenmoMochi(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.genmo.joint_model.asymm_models_joint.AsymmDiTJoint)
@ -998,6 +1021,45 @@ class CosmosVideo(BaseModel):
        latent_image = self.model_sampling.calculate_input(torch.tensor([sigma_noise_augmentation], device=latent_image.device, dtype=latent_image.dtype), latent_image)
        return latent_image * ((sigma ** 2 + self.model_sampling.sigma_data ** 2) ** 0.5)
 class CosmosPredict2(BaseModel):
    """BaseModel subclass built around ``comfy.ldm.cosmos.predict2.MiniTrainDIT``.

    The default model type is ``ModelType.FLOW_COSMOS``. With
    ``image_to_video=True`` the instance also declares ``"mask_inverted"`` as
    its only concat key.
    """
    def __init__(self, model_config, model_type=ModelType.FLOW_COSMOS, image_to_video=False, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cosmos.predict2.MiniTrainDIT)
        self.image_to_video = image_to_video
        if self.image_to_video:
            # Only set for image-to-video; otherwise whatever BaseModel defines stays in effect.
            self.concat_keys = ("mask_inverted",)
    def extra_conds(self, **kwargs):
        """Extend the parent's conds with ``c_crossattn``, ``denoise_mask`` and ``fps``.

        ``c_crossattn`` and ``denoise_mask`` are added only when their source
        values are not ``None``; ``fps`` is always added and wraps
        ``kwargs.get("frame_rate")``, which may be ``None``.
        """
        out = super().extra_conds(**kwargs)
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        # "concat_mask" wins whenever the key is present (even if its value is None);
        # "denoise_mask" is used only when "concat_mask" is absent from kwargs.
        denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
        if denoise_mask is not None:
            out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
        out['fps'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", None))
        return out
    def process_timestep(self, timestep, x, denoise_mask=None, **kwargs):
        """Weight the per-sample timestep by the denoise mask.

        ``timestep`` is returned untouched when there is no mask or when the
        mask has four or fewer dimensions. Otherwise the mask is averaged over
        dims 1, 3 and 4, multiplied by ``timestep`` (reshaped to
        ``(timestep.shape[0], 1, 1, 1, 1)``), and dims 1, 3 and 4 are squeezed
        out of the result. ``x`` is not used.
        """
        if denoise_mask is None:
            return timestep
        if denoise_mask.ndim <= 4:
            return timestep
        # Variable suffix suggests a (batch, 1, frames, 1, 1) layout - TODO confirm.
        condition_video_mask_B_1_T_1_1 = denoise_mask.mean(dim=[1, 3, 4], keepdim=True)
        # Where the averaged mask is 0 the value becomes 0.0; where it is 1 it equals the timestep.
        c_noise_B_1_T_1_1 = 0.0 * (1.0 - condition_video_mask_B_1_T_1_1) + timestep.reshape(timestep.shape[0], 1, 1, 1, 1) * condition_video_mask_B_1_T_1_1
        out = c_noise_B_1_T_1_1.squeeze(dim=[1, 3, 4])
        return out
    def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
        """Rescale ``latent_image`` for inpainting.

        The latent is passed through ``self.model_sampling.calculate_input``
        with a one-element sigma tensor of value ``sigma_noise_augmentation``
        (currently 0), then divided by ``1 - sigma / (sigma + 1)``.
        """
        # Give sigma one leading batch dim followed by singleton dims so it broadcasts against noise.
        sigma = sigma.reshape([sigma.shape[0]] + [1] * (len(noise.shape) - 1))
        sigma_noise_augmentation = 0 #TODO
        # With the constant above this branch never runs; noise is only added for a non-zero augmentation.
        if sigma_noise_augmentation != 0:
            latent_image = latent_image + noise
        latent_image = self.model_sampling.calculate_input(torch.tensor([sigma_noise_augmentation], device=latent_image.device, dtype=latent_image.dtype), latent_image)
        sigma = (sigma / (sigma + 1))
        return latent_image / (1.0 - sigma)
 class Lumina2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiT)
@ -1038,6 +1100,7 @@ class WAN21(BaseModel):
                image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16])
            image = utils.resize_to_batch_size(image, noise.shape[0])
        if extra_channels != image.shape[1] + 4:
            if not self.image_to_video or extra_channels == image.shape[1]:
                return image
@ -1123,6 +1186,31 @@ class WAN21_Camera(WAN21):
            out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions)
        return out
 class WAN22(BaseModel):
    """BaseModel subclass built around ``comfy.ldm.wan.model.WanModel`` (default ``ModelType.FLOW``)."""
    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
        # Stored only; nothing in this class reads the flag.
        self.image_to_video = image_to_video
    def extra_conds(self, **kwargs):
        """Extend the parent's conds with ``c_crossattn`` and ``denoise_mask`` when available."""
        out = super().extra_conds(**kwargs)
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        # "concat_mask" wins whenever the key is present (even if its value is None);
        # "denoise_mask" is used only when "concat_mask" is absent from kwargs.
        denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
        if denoise_mask is not None:
            out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
        return out
    def process_timestep(self, timestep, x, denoise_mask=None, **kwargs):
        """Weight the per-sample timestep by the denoise mask.

        Without a mask ``timestep`` is returned unchanged. Otherwise the mask
        is averaged over dims 1, 3 and 4, multiplied by ``timestep`` (viewed
        with one leading batch dim and singleton trailing dims), and the
        product is reshaped to ``(timestep.shape[0], -1)``. ``x`` is not used.
        """
        if denoise_mask is None:
            return timestep
        # The five-way slice and dim=(1, 3, 4) mean the mask must have at least five dimensions here.
        temp_ts = (torch.mean(denoise_mask[:, :, :, :, :], dim=(1, 3, 4), keepdim=True) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1))).reshape(timestep.shape[0], -1)
        return temp_ts
    def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
        """Return ``latent_image`` unchanged; ``sigma`` and ``noise`` are ignored."""
        return latent_image
 class Hunyuan3Dv2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
@ -1188,3 +1276,33 @@ class ACEStep(BaseModel):
        out['speaker_embeds'] = comfy.conds.CONDRegular(torch.zeros(noise.shape[0], 512, device=noise.device, dtype=noise.dtype))
        out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
        return out
 class Omnigen2(BaseModel):
    """BaseModel subclass built around ``comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel``."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel)
        # 'ref_latents' is the cond key extra_conds()/extra_conds_shapes() fill from `reference_latents`.
        self.memory_usage_factor_conds = ("ref_latents",)

    def extra_conds(self, **kwargs):
        """Extend the parent's conds with the Omnigen2-specific entries.

        Depending on which kwargs are present (and not ``None``) this adds:

        * ``attention_mask`` - only when the mask's sum differs from its
          element count.
        * ``num_tokens`` - the mask sum, floored at 1.
        * ``c_crossattn`` - taken from ``cross_attn``.
        * ``ref_latents`` - every entry of ``reference_latents`` passed through
          ``process_latent_in``.
        """
        out = super().extra_conds(**kwargs)

        attention_mask = kwargs.get("attention_mask", None)
        if attention_mask is not None:
            # Reduce once and reuse the result for both the comparison and the token count.
            active_tokens = attention_mask.sum()
            if torch.numel(attention_mask) != active_tokens:
                out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
            out['num_tokens'] = comfy.conds.CONDConstant(max(1, active_tokens.item()))

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)

        ref_latents = kwargs.get("reference_latents", None)
        if ref_latents is not None:
            out['ref_latents'] = comfy.conds.CONDList([self.process_latent_in(lat) for lat in ref_latents])
        return out

    def extra_conds_shapes(self, **kwargs):
        """Report the shape of the conditioning derived from reference latents.

        Returns an empty dict when ``reference_latents`` is missing or
        ``None``; otherwise maps ``'ref_latents'`` to ``[1, 16, total // 16]``
        where ``total`` is the combined element count of all reference latents.
        """
        ref_latents = kwargs.get("reference_latents", None)
        if ref_latents is None:
            return {}
        total_elements = sum(math.prod(lat.size()) for lat in ref_latents)
        return {'ref_latents': [1, 16, total_elements // 16]}
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -346,7 +346,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config = {}
        dit_config["image_model"] = "wan2.1"
        dim = state_dict['{}head.modulation'.format(key_prefix)].shape[-1]
        out_dim = state_dict['{}head.head.weight'.format(key_prefix)].shape[0] // 4
        dit_config["dim"] = dim
        dit_config["out_dim"] = out_dim
        dit_config["num_heads"] = dim // 128
        dit_config["ffn_dim"] = state_dict['{}blocks.0.ffn.0.weight'.format(key_prefix)].shape[0]
        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
@ -407,6 +409,78 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["text_emb_dim"] = 2048
        return dit_config
    if '{}blocks.0.mlp.layer1.weight'.format(key_prefix) in state_dict_keys:  # Cosmos predict2
        dit_config = {}
        dit_config["image_model"] = "cosmos_predict2"
        dit_config["max_img_h"] = 240
        dit_config["max_img_w"] = 240
        dit_config["max_frames"] = 128
        concat_padding_mask = True
        dit_config["in_channels"] = (state_dict['{}x_embedder.proj.1.weight'.format(key_prefix)].shape[1] // 4) - int(concat_padding_mask)
        dit_config["out_channels"] = 16
        dit_config["patch_spatial"] = 2
        dit_config["patch_temporal"] = 1
        dit_config["model_channels"] = state_dict['{}x_embedder.proj.1.weight'.format(key_prefix)].shape[0]
        dit_config["concat_padding_mask"] = concat_padding_mask
        dit_config["crossattn_emb_channels"] = 1024
        dit_config["pos_emb_cls"] = "rope3d"
        dit_config["pos_emb_learnable"] = True
        dit_config["pos_emb_interpolation"] = "crop"
        dit_config["min_fps"] = 1
        dit_config["max_fps"] = 30
        dit_config["use_adaln_lora"] = True
        dit_config["adaln_lora_dim"] = 256
        if dit_config["model_channels"] == 2048:
            dit_config["num_blocks"] = 28
            dit_config["num_heads"] = 16
        elif dit_config["model_channels"] == 5120:
            dit_config["num_blocks"] = 36
            dit_config["num_heads"] = 40
        if dit_config["in_channels"] == 16:
            dit_config["extra_per_block_abs_pos_emb"] = False
            dit_config["rope_h_extrapolation_ratio"] = 4.0
            dit_config["rope_w_extrapolation_ratio"] = 4.0
            dit_config["rope_t_extrapolation_ratio"] = 1.0
        elif dit_config["in_channels"] == 17: # img to video
            if dit_config["model_channels"] == 2048:
                dit_config["extra_per_block_abs_pos_emb"] = False
                dit_config["rope_h_extrapolation_ratio"] = 3.0
                dit_config["rope_w_extrapolation_ratio"] = 3.0
                dit_config["rope_t_extrapolation_ratio"] = 1.0
            elif dit_config["model_channels"] == 5120:
                dit_config["rope_h_extrapolation_ratio"] = 2.0
                dit_config["rope_w_extrapolation_ratio"] = 2.0
                dit_config["rope_t_extrapolation_ratio"] = 0.8333333333333334
        dit_config["extra_h_extrapolation_ratio"] = 1.0
        dit_config["extra_w_extrapolation_ratio"] = 1.0
        dit_config["extra_t_extrapolation_ratio"] = 1.0
        dit_config["rope_enable_fps_modulation"] = False
        return dit_config
    if '{}time_caption_embed.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys:  # Omnigen2
        dit_config = {}
        dit_config["image_model"] = "omnigen2"
        dit_config["axes_dim_rope"] = [40, 40, 40]
        dit_config["axes_lens"] = [1024, 1664, 1664]
        dit_config["ffn_dim_multiplier"] = None
        dit_config["hidden_size"] = 2520
        dit_config["in_channels"] = 16
        dit_config["multiple_of"] = 256
        dit_config["norm_eps"] = 1e-05
        dit_config["num_attention_heads"] = 21
        dit_config["num_kv_heads"] = 7
        dit_config["num_layers"] = 32
        dit_config["num_refiner_layers"] = 2
        dit_config["out_channels"] = None
        dit_config["patch_size"] = 2
        dit_config["text_feat_dim"] = 2048
        dit_config["timestep_scale"] = 1000.0
        return dit_config
    if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
        return None
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -101,7 +101,7 @@ if args.directml is not None:
    lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default.
 try:
-    import intel_extension_for_pytorch as ipex
+    import intel_extension_for_pytorch as ipex  # noqa: F401
    _ = torch.xpu.device_count()
    xpu_available = xpu_available or torch.xpu.is_available()
 except:
@ -128,6 +128,11 @@ try:
 except:
    mlu_available = False
 try:
    ixuca_available = hasattr(torch, "corex")
 except:
    ixuca_available = False
 if args.cpu:
    cpu_state = CPUState.CPU
@ -151,6 +156,12 @@ def is_mlu():
        return True
    return False
 def is_ixuca():
    """Return ``True`` when the module-level ``ixuca_available`` flag is truthy, else ``False``."""
    return True if ixuca_available else False
 def get_torch_device():
    global directml_enabled
    global cpu_state
@ -186,8 +197,9 @@ def get_total_memory(dev=None, torch_total_too=False):
        elif is_intel_xpu():
            stats = torch.xpu.memory_stats(dev)
            mem_reserved = stats['reserved_bytes.all.current']
            mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory
            mem_total_torch = mem_reserved
-            mem_total = torch.xpu.get_device_properties(dev).total_memory
+            mem_total = mem_total_xpu
        elif is_ascend_npu():
            stats = torch.npu.memory_stats(dev)
            mem_reserved = stats['reserved_bytes.all.current']
@ -288,13 +300,14 @@ try:
        if torch_version_numeric[0] >= 2:
            if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
                ENABLE_PYTORCH_ATTENTION = True
-    if is_intel_xpu() or is_ascend_npu() or is_mlu():
+    if is_intel_xpu() or is_ascend_npu() or is_mlu() or is_ixuca():
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
            ENABLE_PYTORCH_ATTENTION = True
 except:
    pass
 SUPPORT_FP8_OPS = args.supports_fp8_compute
 try:
    if is_amd():
        try:
@ -305,9 +318,16 @@ try:
        logging.info("AMD arch: {}".format(arch))
        logging.info("ROCm version: {}".format(rocm_version))
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
-            if torch_version_numeric[0] >= 2 and torch_version_numeric[1] >= 7:  # works on 2.6 but doesn't actually seem to improve much
+            if torch_version_numeric >= (2, 7):  # works on 2.6 but doesn't actually seem to improve much
-                if any((a in arch) for a in ["gfx1100", "gfx1101", "gfx1151"]):  # TODO: more arches
+                if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]):  # TODO: more arches, TODO: gfx950
                    ENABLE_PYTORCH_ATTENTION = True
            if torch_version_numeric >= (2, 8):
                if any((a in arch) for a in ["gfx1201"]):
                    ENABLE_PYTORCH_ATTENTION = True
        if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
            if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]):  # TODO: more arches
                SUPPORT_FP8_OPS = True
 except:
    pass
@ -328,7 +348,7 @@ except:
    pass
 try:
-    if torch_version_numeric[0] == 2 and torch_version_numeric[1] >= 5:
+    if torch_version_numeric >= (2, 5):
        torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
 except:
    logging.warning("Warning, could not set allow_fp16_bf16_reduction_math_sdp")
@ -372,6 +392,8 @@ def get_torch_device_name(device):
            except:
                allocator_backend = ""
            return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend)
        elif device.type == "xpu":
            return "{} {}".format(device, torch.xpu.get_device_name(device))
        else:
            return "{}".format(device.type)
    elif is_intel_xpu():
@ -507,6 +529,8 @@ WINDOWS = any(platform.win32_ver())
 EXTRA_RESERVED_VRAM = 400 * 1024 * 1024
 if WINDOWS:
    EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue
    if total_vram > (15 * 1024):  # more extra reserved vram on 16GB+ cards
        EXTRA_RESERVED_VRAM += 100 * 1024 * 1024
 if args.reserve_vram is not None:
    EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
@ -871,6 +895,7 @@ def vae_dtype(device=None, allowed_dtypes=[]):
            return d
        # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32
        # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3
        if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device):
            return d
@ -924,7 +949,7 @@ def device_supports_non_blocking(device):
    if is_device_mps(device):
        return False #pytorch bug? mps doesn't support non blocking
    if is_intel_xpu():
-        return False
+        return True
    if args.deterministic: #TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews)
        return False
    if directml_enabled:
@ -963,6 +988,8 @@ def get_offload_stream(device):
        stream_counter = (stream_counter + 1) % len(ss)
        if is_device_cuda(device):
            ss[stream_counter].wait_stream(torch.cuda.current_stream())
        elif is_device_xpu(device):
            ss[stream_counter].wait_stream(torch.xpu.current_stream())
        stream_counters[device] = stream_counter
        return s
    elif is_device_cuda(device):
@ -974,6 +1001,15 @@ def get_offload_stream(device):
        stream_counter = (stream_counter + 1) % len(ss)
        stream_counters[device] = stream_counter
        return s
    elif is_device_xpu(device):
        ss = []
        for k in range(NUM_STREAMS):
            ss.append(torch.xpu.Stream(device=device, priority=0))
        STREAMS[device] = ss
        s = ss[stream_counter]
        stream_counter = (stream_counter + 1) % len(ss)
        stream_counters[device] = stream_counter
        return s
    return None
 def sync_stream(device, stream):
@ -981,6 +1017,8 @@ def sync_stream(device, stream):
        return
    if is_device_cuda(device):
        torch.cuda.current_stream().wait_stream(stream)
    elif is_device_xpu(device):
        torch.xpu.current_stream().wait_stream(stream)
 def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None):
    if device is None or weight.device == device:
@ -1022,6 +1060,8 @@ def xformers_enabled():
        return False
    if is_mlu():
        return False
    if is_ixuca():
        return False
    if directml_enabled:
        return False
    return XFORMERS_IS_AVAILABLE
@ -1047,7 +1087,7 @@ def pytorch_attention_flash_attention():
    global ENABLE_PYTORCH_ATTENTION
    if ENABLE_PYTORCH_ATTENTION:
        #TODO: more reliable way of checking for flash attention?
-        if is_nvidia(): #pytorch flash attention only works on Nvidia
+        if is_nvidia():
            return True
        if is_intel_xpu():
            return True
@ -1057,13 +1097,15 @@ def pytorch_attention_flash_attention():
            return True
        if is_amd():
            return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention
        if is_ixuca():
            return True
    return False
 def force_upcast_attention_dtype():
    upcast = args.force_upcast_attention
    macos_version = mac_version()
-    if macos_version is not None and ((14, 5) <= macos_version < (16,)):  # black image bug on recent versions of macOS
+    if macos_version is not None and ((14, 5) <= macos_version):  # black image bug on recent versions of macOS, I don't think it's ever getting fixed
        upcast = True
    if upcast:
@ -1087,8 +1129,8 @@ def get_free_memory(dev=None, torch_free_too=False):
            stats = torch.xpu.memory_stats(dev)
            mem_active = stats['active_bytes.all.current']
            mem_reserved = stats['reserved_bytes.all.current']
            mem_free_torch = mem_reserved - mem_active
            mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved
            mem_free_torch = mem_reserved - mem_active
            mem_free_total = mem_free_xpu + mem_free_torch
        elif is_ascend_npu():
            stats = torch.npu.memory_stats(dev)
@ -1137,6 +1179,9 @@ def is_device_cpu(device):
 def is_device_mps(device):
    return is_device_type(device, 'mps')
 def is_device_xpu(device):
    # Delegates to is_device_type(), asking whether `device` is of the 'xpu' type.
    return is_device_type(device, 'xpu')
 def is_device_cuda(device):
    return is_device_type(device, 'cuda')
@ -1168,7 +1213,10 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
        return False
    if is_intel_xpu():
        if torch_version_numeric < (2, 3):
            return True
        else:
            return torch.xpu.get_device_properties(device).has_fp16
    if is_ascend_npu():
        return True
@ -1176,6 +1224,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
    if is_mlu():
        return True
    if is_ixuca():
        return True
    if torch.version.hip:
        return True
@ -1231,11 +1282,17 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
        return False
    if is_intel_xpu():
        if torch_version_numeric < (2, 6):
            return True
        else:
            return torch.xpu.get_device_capability(device)['has_bfloat16_conversions']
    if is_ascend_npu():
        return True
    if is_ixuca():
        return True
    if is_amd():
        arch = torch.cuda.get_device_properties(device).gcnArchName
        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]):  # RDNA2 and older don't support bf16
@ -1262,7 +1319,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
    return False
 def supports_fp8_compute(device=None):
-    if args.supports_fp8_compute:
+    if SUPPORT_FP8_OPS:
        return True
    if not is_nvidia():
@ -1276,11 +1333,18 @@ def supports_fp8_compute(device=None):
    if props.minor < 9:
        return False
-    if torch_version_numeric[0] < 2 or (torch_version_numeric[0] == 2 and torch_version_numeric[1] < 3):
+    if torch_version_numeric < (2, 3):
        return False
    if WINDOWS:
-        if (torch_version_numeric[0] == 2 and torch_version_numeric[1] < 4):
+        if torch_version_numeric < (2, 4):
            return False
    return True
 def extended_fp16_support():
    """Return ``True`` only when ``torch_version_numeric`` is at least ``(2, 7)``."""
    # TODO: check why some models work with fp16 on newer torch versions but not on older
    return torch_version_numeric >= (2, 7)
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -17,23 +17,26 @@
 """
 from __future__ import annotations
-from typing import Optional, Callable
+
-import torch
+import collections
 import copy
 import inspect
 import logging
 import uuid
 import collections
 import math
 import uuid
 from typing import Callable, Optional
 import torch
 import comfy.utils
 import comfy.float
 import comfy.model_management
 import comfy.lora
 import comfy.hooks
 import comfy.lora
 import comfy.model_management
 import comfy.patcher_extension
-from comfy.patcher_extension import CallbacksMP, WrappersMP, PatcherInjection
+import comfy.utils
 from comfy.comfy_types import UnetWrapperFunction
 from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
 def string_to_seed(data):
    crc = 0xFFFFFFFF
@ -376,6 +379,9 @@ class ModelPatcher:
    def set_model_sampler_pre_cfg_function(self, pre_cfg_function, disable_cfg1_optimization=False):
        # Replaces self.model_options with whatever set_model_options_pre_cfg_function() returns
        # for the current options, the given callable and the disable_cfg1_optimization flag.
        self.model_options = set_model_options_pre_cfg_function(self.model_options, pre_cfg_function, disable_cfg1_optimization)
    def set_model_sampler_calc_cond_batch_function(self, sampler_calc_cond_batch_function):
        # Registers a replacement for the sampler's calc_cond_batch step. The sampler invokes the
        # callable with a single dict argument carrying the keys "conds", "input", "sigma", "model"
        # and "model_options", and uses its return value in place of calc_cond_batch()'s.
        self.model_options["sampler_calc_cond_batch_function"] = sampler_calc_cond_batch_function
    def set_model_unet_function_wrapper(self, unet_wrapper_function: UnetWrapperFunction):
        # Stores the wrapper under the "model_function_wrapper" key of model_options,
        # overwriting any wrapper that was set before.
        self.model_options["model_function_wrapper"] = unet_wrapper_function
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
@ -77,6 +77,25 @@ class IMG_TO_IMG(X0):
    def calculate_input(self, sigma, noise):
        return noise
 class COSMOS_RFLOW:
    """Input/output scaling helpers that work on the rescaled time ``t = sigma / (sigma + 1)``."""

    def calculate_input(self, sigma, noise):
        """Return ``noise * (1 - t)`` with the per-sample factor broadcast over the trailing dims."""
        t = sigma / (sigma + 1)
        per_sample_shape = t.shape[:1] + (1,) * (noise.ndim - 1)
        t = t.view(per_sample_shape)
        return noise * (1.0 - t)

    def calculate_denoised(self, sigma, model_output, model_input):
        """Return ``model_input * (1 - t) - model_output * t``."""
        t = sigma / (sigma + 1)
        per_sample_shape = t.shape[:1] + (1,) * (model_output.ndim - 1)
        t = t.view(per_sample_shape)
        return model_input * (1.0 - t) - model_output * t

    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
        """Return ``noise * sigma + latent_image``; ``max_denoise`` is not used."""
        per_sample_shape = sigma.shape[:1] + (1,) * (noise.ndim - 1)
        # The product is a fresh tensor, so the in-place add below never touches the caller's `noise`.
        scaled = noise * sigma.view(per_sample_shape)
        scaled += latent_image
        return scaled

    def inverse_noise_scaling(self, sigma, latent):
        """Return ``latent`` unchanged."""
        return latent
 class ModelSamplingDiscrete(torch.nn.Module):
    def __init__(self, model_config=None, zsnr=None):
@ -350,3 +369,15 @@ class ModelSamplingFlux(torch.nn.Module):
        if percent >= 1.0:
            return 0.0
        return flux_time_shift(self.shift, 1.0, 1.0 - percent)
 class ModelSamplingCosmosRFlow(ModelSamplingContinuousEDM):
    """ModelSamplingContinuousEDM variant whose timestep is ``sigma / (sigma + 1)``."""

    def timestep(self, sigma):
        """Convert a sigma to the timestep ``sigma / (sigma + 1)``."""
        denominator = sigma + 1
        return sigma / denominator

    def sigma(self, timestep):
        """Convert a timestep back to a sigma, capped at ``self.sigma_max``.

        Timesteps at or beyond ``sigma_max / (sigma_max + 1)`` yield
        ``sigma_max``; anything below yields ``timestep / (1 - timestep)``.
        """
        ceiling = self.sigma_max
        cutoff = ceiling / (ceiling + 1)
        if timestep >= cutoff:
            return ceiling
        return timestep / (1 - timestep)
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -336,9 +336,12 @@ class fp8_ops(manual_cast):
            return None
        def forward_comfy_cast_weights(self, input):
            """Run the fp8 linear path, falling back to a regular linear on failure.

            ``fp8_linear`` is tried first. If it raises, the exception is logged
            at info level; if it raises or returns ``None``, the weights are
            cast with ``cast_bias_weight`` and ``torch.nn.functional.linear``
            is used instead.
            """
            fp8_result = None
            try:
                fp8_result = fp8_linear(self, input)
            except Exception as e:
                logging.info("Exception during fp8 op: {}".format(e))
            if fp8_result is not None:
                return fp8_result
            weight, bias = cast_bias_weight(self, input)
            return torch.nn.functional.linear(input, weight, bias)
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@ -373,6 +373,10 @@ def sampling_function(model, x, timestep, uncond, cond, cond_scale, model_option
        uncond_ = uncond
    conds = [cond, uncond_]
    if "sampler_calc_cond_batch_function" in model_options:
        args = {"conds": conds, "input": x, "sigma": timestep, "model": model, "model_options": model_options}
        out = model_options["sampler_calc_cond_batch_function"](args)
    else:
        out = calc_cond_batch(model, conds, x, timestep, model_options)
    for fn in model_options.get("sampler_pre_cfg_function", []):
@ -716,7 +720,7 @@ KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_c
                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
                  "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
                  "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp",
-                  "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3"]
+                  "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3", "sa_solver", "sa_solver_pece"]
 class KSAMPLER(Sampler):
    def __init__(self, sampler_function, extra_options={}, inpaint_options={}):
@ -1039,13 +1043,13 @@ class SchedulerHandler(NamedTuple):
    use_ms: bool = True
 SCHEDULER_HANDLERS = {
-    "normal": SchedulerHandler(normal_scheduler),
+    "simple": SchedulerHandler(simple_scheduler),
    "sgm_uniform": SchedulerHandler(partial(normal_scheduler, sgm=True)),
    "karras": SchedulerHandler(k_diffusion_sampling.get_sigmas_karras, use_ms=False),
    "exponential": SchedulerHandler(k_diffusion_sampling.get_sigmas_exponential, use_ms=False),
    "sgm_uniform": SchedulerHandler(partial(normal_scheduler, sgm=True)),
    "simple": SchedulerHandler(simple_scheduler),
    "ddim_uniform": SchedulerHandler(ddim_scheduler),
    "beta": SchedulerHandler(beta_scheduler),
    "normal": SchedulerHandler(normal_scheduler),
    "linear_quadratic": SchedulerHandler(linear_quadratic_schedule),
    "kl_optimal": SchedulerHandler(kl_optimal_scheduler, use_ms=False),
 }
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -14,10 +14,12 @@ import comfy.ldm.genmo.vae.model
 import comfy.ldm.lightricks.vae.causal_video_autoencoder
 import comfy.ldm.cosmos.vae
 import comfy.ldm.wan.vae
 import comfy.ldm.wan.vae2_2
 import comfy.ldm.hunyuan3d.vae
 import comfy.ldm.ace.vae.music_dcae_pipeline
 import yaml
 import math
 import os
 import comfy.utils
@ -44,6 +46,7 @@ import comfy.text_encoders.lumina2
 import comfy.text_encoders.wan
 import comfy.text_encoders.hidream
 import comfy.text_encoders.ace
 import comfy.text_encoders.omnigen2
 import comfy.model_patcher
 import comfy.lora
@ -418,6 +421,19 @@ class VAE:
                self.memory_used_encode = lambda shape, dtype: (50 * (round((shape[2] + 7) / 8) * 8) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
                self.working_dtypes = [torch.bfloat16, torch.float32]
            elif "decoder.middle.0.residual.0.gamma" in sd:
                if "decoder.upsamples.0.upsamples.0.residual.2.weight" in sd:  # Wan 2.2 VAE
                    self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
                    self.upscale_index_formula = (4, 16, 16)
                    self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
                    self.downscale_index_formula = (4, 16, 16)
                    self.latent_dim = 3
                    self.latent_channels = 48
                    ddconfig = {"dim": 160, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
                    self.first_stage_model = comfy.ldm.wan.vae2_2.WanVAE(**ddconfig)
                    self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
                    self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype)
                    self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype)
                else:  # Wan 2.1 VAE
                    self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
                    self.upscale_index_formula = (4, 8, 8)
                    self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
@ -754,6 +770,7 @@ class CLIPType(Enum):
    HIDREAM = 14
    CHROMA = 15
    ACE = 16
    OMNIGEN2 = 17
 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@ -773,6 +790,7 @@ class TEModel(Enum):
    LLAMA3_8 = 7
    T5_XXL_OLD = 8
    GEMMA_2_2B = 9
    QWEN25_3B = 10
 def detect_te_model(sd):
    if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@ -793,6 +811,8 @@ def detect_te_model(sd):
        return TEModel.T5_BASE
    if 'model.layers.0.post_feedforward_layernorm.weight' in sd:
        return TEModel.GEMMA_2_2B
    if 'model.layers.0.self_attn.k_proj.bias' in sd:
        return TEModel.QWEN25_3B
    if "model.layers.0.post_attention_layernorm.weight" in sd:
        return TEModel.LLAMA3_8
    return None
@ -894,6 +914,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data),
                                                                        clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
            clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
        elif te_model == TEModel.QWEN25_3B:
            clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer
        else:
            # clip_l
            if clip_type == CLIPType.SD3:
@ -969,6 +992,12 @@ def load_gligen(ckpt_path):
        model = model.half()
    return comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device())
 def model_detection_error_hint(path, state_dict):
    """Return an extra hint to attach to a "could not detect model type" error.

    Only the final component of ``path`` is inspected, case-insensitively.
    ``state_dict`` is accepted but not consulted by this function.

    Returns:
        A newline-prefixed hint when the file name contains "lora",
        otherwise an empty string.
    """
    lowered_name = os.path.basename(path).lower()
    if 'lora' not in lowered_name:
        return ""
    return "\nHINT: This seems to be a Lora file and Lora files should be put in the lora folder and loaded with a lora loader node.."
 def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_clip=True, embedding_directory=None, state_dict=None, config=None):
    logging.warning("Warning: The load checkpoint with config function is deprecated and will eventually be removed, please use the other one.")
    model, clip, vae, _ = load_checkpoint_guess_config(ckpt_path, output_vae=output_vae, output_clip=output_clip, output_clipvision=False, embedding_directory=embedding_directory, output_model=True)
@ -997,7 +1026,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
    sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
    if out is None:
-        raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
+        raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
    return out
 def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
@ -1081,7 +1110,28 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    return (model_patcher, clip, vae, clipvision)
-def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffusers or regular format
+def load_diffusion_model_state_dict(sd, model_options={}):
    """
    Loads a UNet diffusion model from a state dictionary, supporting both diffusers and regular formats.
    Args:
        sd (dict): State dictionary containing model weights and configuration
        model_options (dict, optional): Additional options for model loading. Supports:
            - dtype: Override model data type
            - custom_operations: Custom model operations
            - fp8_optimizations: Enable FP8 optimizations
    Returns:
        ModelPatcher: A wrapped model instance that handles device management and weight loading.
        Returns None if the model configuration cannot be detected.
    The function:
    1. Detects and handles different model formats (regular, diffusers, mmdit)
    2. Configures model dtype based on parameters and device capabilities
    3. Handles weight conversion and device placement
    4. Manages model optimization settings
    5. Loads weights and returns a device-managed model instance
    """
    dtype = model_options.get("dtype", None)
    #Allow loading unets from checkpoint files
@ -1139,7 +1189,7 @@ def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffuse
    model.load_model_weights(new_sd, "")
    left_over = sd.keys()
    if len(left_over) > 0:
-        logging.info("left over keys in unet: {}".format(left_over))
+        logging.info("left over keys in diffusion model: {}".format(left_over))
    return comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device)
@ -1147,8 +1197,8 @@ def load_diffusion_model(unet_path, model_options={}):
    sd = comfy.utils.load_torch_file(unet_path)
    model = load_diffusion_model_state_dict(sd, model_options=model_options)
    if model is None:
-        logging.error("ERROR UNSUPPORTED UNET {}".format(unet_path))
+        logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path))
-        raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path))
+        raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd)))
    return model
 def load_unet(unet_path, dtype=None):
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@ -462,7 +462,7 @@ class SDTokenizer:
            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
        self.max_length = tokenizer_data.get("{}_max_length".format(embedding_key), max_length)
-        self.min_length = min_length
+        self.min_length = tokenizer_data.get("{}_min_length".format(embedding_key), min_length)
        self.end_token = None
        self.min_padding = min_padding
@ -482,6 +482,7 @@ class SDTokenizer:
            if end_token is not None:
                self.end_token = end_token
            else:
                if has_end_token:
                    self.end_token = empty[0]
        if pad_token is not None:
--- a/comfy/sd1_tokenizer/tokenizer_config.json
+++ b/comfy/sd1_tokenizer/tokenizer_config.json
@ -18,7 +18,7 @@
    "single_word": false
  },
  "errors": "replace",
-  "model_max_length": 77,
+  "model_max_length": 8192,
  "name_or_path": "openai/clip-vit-large-patch14",
  "pad_token": "<|endoftext|>",
  "special_tokens_map_file": "./special_tokens_map.json",
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -18,6 +18,7 @@ import comfy.text_encoders.cosmos
 import comfy.text_encoders.lumina2
 import comfy.text_encoders.wan
 import comfy.text_encoders.ace
 import comfy.text_encoders.omnigen2
 from . import supported_models_base
 from . import latent_formats
@ -908,6 +909,48 @@ class CosmosI2V(CosmosT2V):
        out = model_base.CosmosVideo(self, image_to_video=True, device=device)
        return out
 class CosmosT2IPredict2(supported_models_base.BASE):
    """Model description whose unet_config declares "cosmos_predict2" with 16 input channels."""
    unet_config = {"image_model": "cosmos_predict2", "in_channels": 16}
    sampling_settings = {"sigma_data": 1.0, "sigma_max": 80.0, "sigma_min": 0.002}
    unet_extra_config = {}
    latent_format = latent_formats.Wan21
    # Class-level default; __init__ assigns a per-instance value.
    memory_usage_factor = 1.0
    supported_inference_dtypes = [torch.bfloat16, torch.float32]
    def __init__(self, unet_config):
        """Initialise the base class, then scale the memory factor by "model_channels" (default 2048)."""
        super().__init__(unet_config)
        channel_ratio = unet_config.get("model_channels", 2048) / 2048
        self.memory_usage_factor = channel_ratio * 0.9
    def get_model(self, state_dict, prefix="", device=None):
        """Build the CosmosPredict2 model for this config; state_dict and prefix are unused here."""
        return model_base.CosmosPredict2(self, device=device)
    def clip_target(self, state_dict={}):
        """Return the ClipTarget pairing the Cosmos T5 tokenizer with the detected T5 text encoder."""
        key_prefix = "{}t5xxl.transformer.".format(self.text_encoder_key_prefix[0])
        detected = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, key_prefix)
        return supported_models_base.ClipTarget(comfy.text_encoders.cosmos.CosmosT5Tokenizer, comfy.text_encoders.cosmos.te(**detected))
 class CosmosI2VPredict2(CosmosT2IPredict2):
    """Variant of CosmosT2IPredict2 declaring 17 input channels and building the model with image_to_video=True."""
    unet_config = {"image_model": "cosmos_predict2", "in_channels": 17}
    def get_model(self, state_dict, prefix="", device=None):
        """Build the CosmosPredict2 model in image-to-video mode; state_dict and prefix are unused here."""
        return model_base.CosmosPredict2(self, image_to_video=True, device=device)
 class Lumina2(supported_models_base.BASE):
    unet_config = {
        "image_model": "lumina2",
@ -1016,6 +1059,19 @@ class WAN21_Vace(WAN21_T2V):
        out = model_base.WAN21_Vace(self, image_to_video=False, device=device)
        return out
 class WAN22_T2V(WAN21_T2V):
    """WAN21_T2V variant whose unet_config requires out_dim 48 and which uses the Wan22 latent format."""
    unet_config = {"image_model": "wan2.1", "model_type": "t2v", "out_dim": 48}
    latent_format = latent_formats.Wan22
    def get_model(self, state_dict, prefix="", device=None):
        """Build the WAN22 model for this config; state_dict and prefix are unused here."""
        # NOTE(review): image_to_video=True is passed even though this class is named T2V - confirm intended.
        return model_base.WAN22(self, image_to_video=True, device=device)
 class Hunyuan3Dv2(supported_models_base.BASE):
    unet_config = {
        "image_model": "hunyuan3d2",
@ -1139,6 +1195,41 @@ class ACEStep(supported_models_base.BASE):
    def clip_target(self, state_dict={}):
        return supported_models_base.ClipTarget(comfy.text_encoders.ace.AceT5Tokenizer, comfy.text_encoders.ace.AceT5Model)
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep]
+class Omnigen2(supported_models_base.BASE):
    unet_config = {
        "image_model": "omnigen2",
    }
    sampling_settings = {
        "multiplier": 1.0,
        "shift": 2.6,
    }
    memory_usage_factor = 1.65 #TODO
    unet_extra_config = {}
    latent_format = latent_formats.Flux
    supported_inference_dtypes = [torch.bfloat16, torch.float32]
    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]
    def __init__(self, unet_config):
        """Initialise the base class and, when extended fp16 support is reported, prefer float16."""
        super().__init__(unet_config)
        if not comfy.model_management.extended_fp16_support():
            return
        # A new list is assigned on the instance; the existing list is not mutated.
        self.supported_inference_dtypes = [torch.float16] + self.supported_inference_dtypes
    def get_model(self, state_dict, prefix="", device=None):
        """Build the Omnigen2 model for this config; state_dict and prefix are unused here."""
        return model_base.Omnigen2(self, device=device)
    def clip_target(self, state_dict={}):
        """Return the ClipTarget pairing the Omnigen2 tokenizer with the detected text encoder class."""
        # Detection looks at keys under "<first text encoder prefix>qwen25_3b.transformer.".
        key_prefix = "{}qwen25_3b.transformer.".format(self.text_encoder_key_prefix[0])
        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, key_prefix)
        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**detected))
 models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2]
 models += [SVD_img2vid]
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@ -24,6 +24,24 @@ class Llama2Config:
    head_dim = 128
    rms_norm_add = False
    mlp_activation = "silu"
    qkv_bias = False
@dataclass
 class Qwen25_3BConfig:
    """Configuration values used to build the Qwen25_3B text model."""
    # Annotated attributes below are dataclass fields and can be overridden
    # through constructor keyword arguments.
    vocab_size: int = 151936
    hidden_size: int = 2048
    intermediate_size: int = 11008
    num_hidden_layers: int = 36
    num_attention_heads: int = 16
    num_key_value_heads: int = 2
    max_position_embeddings: int = 128000
    rms_norm_eps: float = 1e-6
    rope_theta: float = 1000000.0
    transformer_type: str = "llama"
    # The remaining attributes have no type annotation, so @dataclass does not
    # treat them as fields: they are plain class attributes and are not
    # accepted as constructor arguments.
    head_dim = 128
    rms_norm_add = False
    mlp_activation = "silu"
    # Read by Attention to decide whether the q/k/v projections get a bias term.
    qkv_bias = True
@dataclass
 class Gemma2_2B_Config:
@ -40,6 +58,7 @@ class Gemma2_2B_Config:
    head_dim = 256
    rms_norm_add = True
    mlp_activation = "gelu_pytorch_tanh"
    qkv_bias = False
 class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None):
@ -98,9 +117,9 @@ class Attention(nn.Module):
        self.inner_size = self.num_heads * self.head_dim
        ops = ops or nn
-        self.q_proj = ops.Linear(config.hidden_size, self.inner_size, bias=False, device=device, dtype=dtype)
+        self.q_proj = ops.Linear(config.hidden_size, self.inner_size, bias=config.qkv_bias, device=device, dtype=dtype)
-        self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False, device=device, dtype=dtype)
+        self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
-        self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False, device=device, dtype=dtype)
+        self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
        self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, device=device, dtype=dtype)
    def forward(
@ -320,6 +339,14 @@ class Llama2(BaseLlama, torch.nn.Module):
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype
 class Qwen25_3B(BaseLlama, torch.nn.Module):
    """Text model that wraps a Llama2_ network configured by Qwen25_3BConfig."""
    def __init__(self, config_dict, dtype, device, operations):
        """Build the network.

        config_dict entries are passed to Qwen25_3BConfig as keyword arguments;
        dtype, device and operations are forwarded to Llama2_.
        """
        super().__init__()
        config = Qwen25_3BConfig(**config_dict)
        # Kept on the wrapper so the layer count is available without going through the config.
        self.num_layers = config.num_hidden_layers
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype
 class Gemma2_2B(BaseLlama, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
--- a/comfy/text_encoders/omnigen2.py
+++ b/comfy/text_encoders/omnigen2.py
@ -0,0 +1,44 @@
 from transformers import Qwen2Tokenizer
 from comfy import sd1_clip
 import comfy.text_encoders.llama
 import os
 class Qwen25_3BTokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer set up with the Qwen2 tokenizer files stored next to this module."""
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        """Configure the tokenizer; embedding_directory is accepted but not forwarded to the base class."""
        module_dir = os.path.dirname(os.path.realpath(__file__))
        tokenizer_path = os.path.join(module_dir, "qwen25_tokenizer")
        super().__init__(
            tokenizer_path,
            pad_with_end=False,
            embedding_size=2048,
            embedding_key='qwen25_3b',
            tokenizer_class=Qwen2Tokenizer,
            has_start_token=False,
            has_end_token=False,
            pad_to_max_length=False,
            max_length=99999999,
            min_length=1,
            pad_token=151643,
            tokenizer_data=tokenizer_data,
        )
 class Omnigen2Tokenizer(sd1_clip.SD1Tokenizer):
    """SD1Tokenizer wrapper that places the prompt into a chat-style template before tokenizing."""
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        """Register Qwen25_3BTokenizer under the name "qwen25_3b" and store the default template."""
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            name="qwen25_3b",
            tokenizer=Qwen25_3BTokenizer,
        )
        # Default template; "{}" is replaced by the prompt text.
        self.llama_template = '<|im_start|>system\nYou are a helpful assistant that generates high-quality images based on user instructions.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n'
    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs):
        """Format text into llama_template (or the default template when None) and tokenize the result."""
        template = self.llama_template if llama_template is None else llama_template
        return super().tokenize_with_weights(template.format(text), return_word_ids=return_word_ids, **kwargs)
 class Qwen25_3BModel(sd1_clip.SDClipModel):
    """SDClipModel configured to run the Qwen25_3B text model."""
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
        """Forward the settings to SDClipModel; attention_mask drives both enabling and returning masks."""
        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"pad": 151643},
            layer_norm_hidden_state=False,
            model_class=comfy.text_encoders.llama.Qwen25_3B,
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )
 class Omnigen2Model(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper that registers Qwen25_3BModel under the name "qwen25_3b"."""
    def __init__(self, device="cpu", dtype=None, model_options={}):
        """Forward device, dtype and model_options to SD1ClipModel with the Qwen clip model."""
        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen25_3b",
            clip_model=Qwen25_3BModel,
            model_options=model_options,
        )
 def te(dtype_llama=None, llama_scaled_fp8=None):
    """Create an Omnigen2Model subclass with a dtype override and scaled-fp8 setting baked in.

    Args:
        dtype_llama: When not None, replaces the dtype passed to the constructor.
        llama_scaled_fp8: When not None, stored under "scaled_fp8" in a copy of
            model_options unless the caller already supplied that key.

    Returns:
        The generated class (not an instance).
    """
    class Omnigen2TEModel_(Omnigen2Model):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            inject_fp8 = llama_scaled_fp8 is not None and "scaled_fp8" not in model_options
            if inject_fp8:
                # Copy first so the caller's dict (or the shared default) is never mutated.
                model_options = model_options.copy()
                model_options["scaled_fp8"] = llama_scaled_fp8
            chosen_dtype = dtype if dtype_llama is None else dtype_llama
            super().__init__(device=device, dtype=chosen_dtype, model_options=model_options)
    return Omnigen2TEModel_
--- a/comfy/text_encoders/qwen25_tokenizer/merges.txt
+++ b/comfy/text_encoders/qwen25_tokenizer/merges.txt
--- a/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json
+++ b/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json
@ -0,0 +1,241 @@
 {
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151657": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151658": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151659": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151660": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151661": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151662": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151663": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151664": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151665": {
      "content": "<|img|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151666": {
      "content": "<|endofimg|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151667": {
      "content": "<|meta|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151668": {
      "content": "<|endofmeta|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- 
'<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 131072,
  "pad_token": "<|endoftext|>",
  "processor_class": "Qwen2_5_VLProcessor",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
 }
--- a/comfy/text_encoders/qwen25_tokenizer/vocab.json
+++ b/comfy/text_encoders/qwen25_tokenizer/vocab.json
--- a/comfy/text_encoders/t5.py
+++ b/comfy/text_encoders/t5.py
@ -146,7 +146,7 @@ class T5Attention(torch.nn.Module):
        )
        values = self.relative_attention_bias(relative_position_bucket, out_dtype=dtype)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
-        return values
+        return values.contiguous()
    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        q = self.q(x)
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -31,6 +31,7 @@ from einops import rearrange
 from comfy.cli_args import args
 MMAP_TORCH_FILES = args.mmap_torch_files
 DISABLE_MMAP = args.disable_mmap
 ALWAYS_SAFE_LOAD = False
 if hasattr(torch.serialization, "add_safe_globals"):  # TODO: this was added in pytorch 2.4, the unsafe path should be removed once earlier versions are deprecated
@ -64,7 +65,10 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
            with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
                sd = {}
                for k in f.keys():
-                    sd[k] = f.get_tensor(k)
+                    tensor = f.get_tensor(k)
                    if DISABLE_MMAP:  # TODO: Not sure if this is the best way to bypass the mmap issues
                        tensor = tensor.to(device=device, copy=True)
                    sd[k] = tensor
                if return_metadata:
                    metadata = f.metadata()
        except Exception as e:
@ -85,6 +89,7 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
        if safe_load or ALWAYS_SAFE_LOAD:
            pl_sd = torch.load(ckpt, map_location=device, weights_only=True, **torch_args)
        else:
            logging.warning("WARNING: loading {} unsafely, upgrade your pytorch to 2.4 or newer to load this file safely.".format(ckpt))
            pl_sd = torch.load(ckpt, map_location=device, pickle_module=comfy.checkpoint_pickle)
        if "state_dict" in pl_sd:
            sd = pl_sd["state_dict"]
@ -708,6 +713,26 @@ def resize_to_batch_size(tensor, batch_size):
    return output
 def resize_list_to_batch_size(l, batch_size):
    """Resample the sequence ``l`` so that it has ``batch_size`` entries.

    * If ``l`` is empty or already has ``batch_size`` entries, ``l`` itself is returned.
    * If ``batch_size <= 1`` the slice ``l[:batch_size]`` is returned.
    * When shrinking, indices are ``round(i * (len - 1) / (batch_size - 1))``.
    * When growing, indices are ``floor((i + 0.5) * len / batch_size)``, so
      elements are repeated.

    Indices are clamped to the last valid position. The resampled result is
    always a new list.
    """
    source_len = len(l)
    if source_len == batch_size or source_len == 0:
        return l
    if batch_size <= 1:
        return l[:batch_size]
    last_index = source_len - 1
    if batch_size < source_len:
        step = last_index / (batch_size - 1)
        return [l[min(round(pos * step), last_index)] for pos in range(batch_size)]
    step = source_len / batch_size
    return [l[min(math.floor((pos + 0.5) * step), last_index)] for pos in range(batch_size)]
 def convert_sd_to(state_dict, dtype):
    keys = list(state_dict.keys())
    for k in keys:
@ -1012,11 +1037,12 @@ def set_progress_bar_global_hook(function):
    PROGRESS_BAR_HOOK = function
 class ProgressBar:
-    def __init__(self, total):
+    def __init__(self, total, node_id=None):
        global PROGRESS_BAR_HOOK
        self.total = total
        self.current = 0
        self.hook = PROGRESS_BAR_HOOK
        self.node_id = node_id
    def update_absolute(self, value, total=None, preview=None):
        if total is not None:
@ -1025,7 +1051,7 @@ class ProgressBar:
            value = self.total
        self.current = value
        if self.hook is not None:
-            self.hook(self.current, self.total, preview)
+            self.hook(self.current, self.total, preview, node_id=self.node_id)
    def update(self, value):
        self.update_absolute(self.current + value)
--- a/comfy/weight_adapter/init.py
+++ b/comfy/weight_adapter/init.py
@ -1,4 +1,4 @@
-from .base import WeightAdapterBase
+from .base import WeightAdapterBase, WeightAdapterTrainBase
 from .lora import LoRAAdapter
 from .loha import LoHaAdapter
 from .lokr import LoKrAdapter
@ -15,3 +15,20 @@ adapters: list[type[WeightAdapterBase]] = [
    OFTAdapter,
    BOFTAdapter,
 ]
 # Maps an adapter type name to the adapter class that implements it.
 adapter_maps: dict[str, type[WeightAdapterBase]] = {
    "LoRA": LoRAAdapter,
    "LoHa": LoHaAdapter,
    "LoKr": LoKrAdapter,
    "OFT": OFTAdapter,
    # Algorithms that are not implemented yet are disabled for now:
    # "GLoRA": GLoRAAdapter,
    # "BOFT": BOFTAdapter,
 }
 # Public API: the fixed names below plus the class name of every entry in `adapters`.
 __all__ = [
    "WeightAdapterBase",
    "WeightAdapterTrainBase",
    "adapters",
    "adapter_maps",
 ] + [a.__name__ for a in adapters]
--- a/comfy/weight_adapter/base.py
+++ b/comfy/weight_adapter/base.py
@ -12,12 +12,20 @@ class WeightAdapterBase:
    weights: list[torch.Tensor]
    @classmethod
-    def load(cls, x: str, lora: dict[str, torch.Tensor]) -> Optional["WeightAdapterBase"]:
+    def load(cls, x: str, lora: dict[str, torch.Tensor], alpha: float, dora_scale: torch.Tensor) -> Optional["WeightAdapterBase"]:
        raise NotImplementedError
    def to_train(self) -> "WeightAdapterTrainBase":
        raise NotImplementedError
    @classmethod
    def create_train(cls, weight, *args) -> "WeightAdapterTrainBase":
        """
        weight: The original weight tensor to be modified.
        *args: Additional arguments for configuration, such as rank, alpha etc.
        """
        raise NotImplementedError
    def calculate_weight(
        self,
        weight,
@ -33,10 +41,22 @@ class WeightAdapterBase:
 class WeightAdapterTrainBase(nn.Module):
    # We follow the scheme of PR #7032
    def __init__(self):
        super().__init__()
-    # [TODO] Collaborate with LoRA training PR #7032
+    def __call__(self, w):
        """
        w: The original weight tensor to be modified.
        """
        raise NotImplementedError
    def passive_memory_usage(self):
        """Placeholder that always raises NotImplementedError.

        move_to returns this method's result, so subclasses presumably
        override it with a memory figure - TODO confirm the expected unit.
        """
        raise NotImplementedError("passive_memory_usage is not implemented")
    def move_to(self, device):
        """Move this module to ``device`` and return the value of passive_memory_usage()."""
        self.to(device)
        return self.passive_memory_usage()
 def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function):
@ -102,3 +122,54 @@ def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Ten
    padded_tensor[new_slices] = tensor[orig_slices]
    return padded_tensor
 def tucker_weight_from_conv(up, down, mid):
    """
    Rebuild a full weight from Tucker-style factors stored as conv weights.

    `up` and `down` are collapsed to 2-D matrices using their first two sizes
    (so any trailing dimensions must have size 1) and contracted with the core
    `mid`: result[i, j, ...] = sum over m, n of mid[m, n, ...] * up[i, m] * down[n, j].
    """
    up_matrix = up.reshape(up.shape[0], up.shape[1])
    down_matrix = down.reshape(down.shape[0], down.shape[1])
    return torch.einsum("m n ..., i m, n j -> i j ...", mid, up_matrix, down_matrix)
 def tucker_weight(wa, wb, t):
    """
    Contract the core tensor `t` with `wb` and then with `wa`.

    Step 1 contracts the second axis of `t` with the first axis of `wb`;
    step 2 contracts the first axis of that result with the first axis of
    `wa`. Trailing axes of `t` are carried through unchanged.
    """
    contracted_wb = torch.einsum("i j ..., j r -> i r ...", t, wb)
    rebuilt = torch.einsum("i j ..., i r -> r j ...", contracted_wb, wa)
    return rebuilt
 def factorization(dimension: int, factor: int = -1) -> tuple[int, int]:
    """
    Split `dimension` into two integer factors (m, n) with m * n == dimension
    and m <= n.

    If `factor` is positive, divides `dimension`, and factor**2 <= dimension,
    the result is (factor, dimension // factor). Otherwise m is grown through
    the divisors of `dimension` for as long as the next divisor does not
    exceed `factor`; a negative `factor` means "no limit", which yields the
    most balanced split.

    examples)
    factor
        -1               2                4               8               16
    127 -> 1, 127   127 -> 1, 127    127 -> 1, 127   127 -> 1, 127   127 -> 1, 127
    128 -> 8, 16    128 -> 2, 64     128 -> 4, 32    128 -> 8, 16    128 -> 8, 16
    250 -> 10, 25   250 -> 2, 125    250 -> 2, 125   250 -> 5, 50    250 -> 10, 25
    360 -> 18, 20   360 -> 2, 180    360 -> 4, 90    360 -> 8, 45    360 -> 15, 24
    512 -> 16, 32   512 -> 2, 256    512 -> 4, 128   512 -> 8, 64    512 -> 16, 32
    1024 -> 32, 32  1024 -> 2, 512   1024 -> 4, 256  1024 -> 8, 128  1024 -> 16, 64
    """
    if factor > 0 and dimension % factor == 0 and dimension >= factor**2:
        m, n = factor, dimension // factor
        return (m, n) if m <= n else (n, m)
    if factor < 0:
        factor = dimension
    m, n = 1, dimension
    while m < n:
        # Advance to the next divisor of `dimension` above the current m.
        new_m = m + 1
        while dimension % new_m != 0:
            new_m += 1
        # The old "new_m + new_n > 1 + dimension" guard could never fire for a
        # divisor pair, so the only real stop condition is the factor cap.
        if new_m > factor:
            break
        m, n = new_m, dimension // new_m
    # The last step may overshoot past the square root; restore m <= n.
    return (m, n) if m <= n else (n, m)
--- a/comfy/weight_adapter/loha.py
+++ b/comfy/weight_adapter/loha.py
@ -3,7 +3,120 @@ from typing import Optional
 import torch
 import comfy.model_management
-from .base import WeightAdapterBase, weight_decompose
+from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose
 class HadaWeight(torch.autograd.Function):
    """
    Custom autograd function for the Hadamard-product weight difference
    ((w1u @ w1d) * (w2u @ w2d)) * scale, with a hand-written backward pass.
    """
    @staticmethod
    def forward(ctx, w1u, w1d, w2u, w2d, scale=torch.tensor(1)):
        """Return the scaled element-wise product of the two low-rank products."""
        ctx.save_for_backward(w1d, w1u, w2d, w2u, scale)
        first = w1u @ w1d
        second = w2u @ w2d
        return (first * second) * scale
    @staticmethod
    def backward(ctx, grad_out):
        """Return gradients for w1u, w1d, w2u, w2d; `scale` receives None."""
        w1d, w1u, w2d, w2u, scale = ctx.saved_tensors
        scaled_grad = grad_out * scale
        # Gradient w.r.t. the first product is the upstream gradient times the second.
        grad_first = scaled_grad * (w2u @ w2d)
        grad_w1u = grad_first @ w1d.T
        grad_w1d = w1u.T @ grad_first
        del grad_first
        # ... and symmetrically for the second product.
        grad_second = scaled_grad * (w1u @ w1d)
        grad_w2u = grad_second @ w2d.T
        grad_w2d = w2u.T @ grad_second
        del grad_second
        return grad_w1u, grad_w1d, grad_w2u, grad_w2d, None
 class HadaWeightTucker(torch.autograd.Function):
    """
    Custom autograd function for the Tucker variant of the Hadamard-product
    weight difference: rebuild1 * rebuild2 * scale, where
    rebuild_k[p, r, ...] = sum over i, j of t_k[i, j, ...] * w_kd[j, r] * w_ku[i, p].
    """
    @staticmethod
    def forward(ctx, t1, w1u, w1d, t2, w2u, w2d, scale=torch.tensor(1)):
        """Return the scaled element-wise product of the two rebuilt tensors."""
        ctx.save_for_backward(t1, w1d, w1u, t2, w2d, w2u, scale)
        rebuild1 = torch.einsum("i j ..., j r, i p -> p r ...", t1, w1d, w1u)
        rebuild2 = torch.einsum("i j ..., j r, i p -> p r ...", t2, w2d, w2u)
        return rebuild1 * rebuild2 * scale
    @staticmethod
    def backward(ctx, grad_out):
        """
        Return gradients for (t1, w1u, w1d, t2, w2u, w2d); `scale` receives None.

        The gradient of each up matrix must be taken against that same
        factor's own `t @ wd` contraction. The previous code used the other
        factor's contraction there, which produced wrong gradients for w1u
        and w2u without raising any error.
        """
        (t1, w1d, w1u, t2, w2d, w2u, scale) = ctx.saved_tensors
        grad_out = grad_out * scale
        # Partial contractions t @ wd of both factors; each is needed twice.
        temp1 = torch.einsum("i j ..., j r -> i r ...", t1, w1d)
        temp2 = torch.einsum("i j ..., j r -> i r ...", t2, w2d)

        # Factor 1: its upstream gradient is grad_out * rebuild2.
        rebuild = torch.einsum("i j ..., i r -> r j ...", temp2, w2u)
        grad_w = rebuild * grad_out
        del rebuild
        grad_w1u = torch.einsum("r j ..., i j ... -> r i", temp1, grad_w)
        grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w1u.T)
        del grad_w
        grad_w1d = torch.einsum("i r ..., i j ... -> r j", t1, grad_temp)
        grad_t1 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w1d.T)
        del grad_temp

        # Factor 2: its upstream gradient is grad_out * rebuild1.
        rebuild = torch.einsum("i j ..., i r -> r j ...", temp1, w1u)
        grad_w = rebuild * grad_out
        del rebuild, temp1
        grad_w2u = torch.einsum("r j ..., i j ... -> r i", temp2, grad_w)
        grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w2u.T)
        del grad_w, temp2
        grad_w2d = torch.einsum("i r ..., i j ... -> r j", t2, grad_temp)
        grad_t2 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w2d.T)
        del grad_temp
        return grad_t1, grad_w1u, grad_w1d, grad_t2, grad_w2u, grad_w2d, None
 class LohaDiff(WeightAdapterTrainBase):
    """
    Trainable LoHa adapter: adds (alpha / rank)-scaled Hadamard-product
    difference weights to the original weight.
    """
    def __init__(self, weights):
        """
        weights: 8-tuple (w1a, w1b, alpha, w2a, w2b, t1, t2, <ignored>) in the
        layout used by LoHaAdapter. The Tucker path is enabled only when both
        t1 and t2 are given.
        """
        super().__init__()
        w1a, w1b, alpha, w2a, w2b, t1, t2, _ = weights
        as_param = torch.nn.Parameter
        # Trainable low-rank factors.
        self.hada_w1_a = as_param(w1a)
        self.hada_w1_b = as_param(w1b)
        self.hada_w2_a = as_param(w2a)
        self.hada_w2_b = as_param(w2b)
        self.use_tucker = t1 is not None and t2 is not None
        if self.use_tucker:
            self.hada_t1 = as_param(t1)
            self.hada_t2 = as_param(t2)
        else:
            # The attributes still exist so access is uniform.
            self.hada_t1 = None
            self.hada_t2 = None
        # Rank is read from the first dimension of w1b; alpha is stored but frozen.
        self.rank = w1b.shape[0]
        self.alpha = as_param(torch.tensor(alpha), requires_grad=False)
    def __call__(self, w):
        """Return `w` plus the scaled difference weight, in w's original dtype."""
        original_dtype = w.dtype
        scale = self.alpha / self.rank
        if not self.use_tucker:
            delta = HadaWeight.apply(
                self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale
            )
        else:
            delta = HadaWeightTucker.apply(
                self.hada_t1,
                self.hada_w1_a,
                self.hada_w1_b,
                self.hada_t2,
                self.hada_w2_a,
                self.hada_w2_b,
                scale,
            )
        # The sum is computed in the difference's dtype/device, then cast back.
        adapted = w.to(delta) + delta.reshape(w.shape)
        return adapted.to(original_dtype)
    def passive_memory_usage(self):
        """Return the total size in bytes of all parameters of this module."""
        total = 0
        for param in self.parameters():
            total += param.numel() * param.element_size()
        return total
 class LoHaAdapter(WeightAdapterBase):
@ -13,6 +126,25 @@ class LoHaAdapter(WeightAdapterBase):
        self.loaded_keys = loaded_keys
        self.weights = weights
    @classmethod
    def create_train(cls, weight, rank=1, alpha=1.0):
        """
        Create a freshly initialised trainable LoHa adapter for `weight`.

        weight: The original weight tensor; its first dimension is the output
            size and the remaining dimensions are flattened into the input size.
        rank: Rank of both low-rank products.
        alpha: Scaling numerator handed to LohaDiff.

        The first product starts at zero (its second factor is all zeros), so
        the initial weight difference is zero.
        """
        out_dim = weight.shape[0]
        in_dim = weight.shape[1:].numel()
        mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype)
        mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype)
        # normal_'s second positional argument is the mean, so the spread must
        # be passed by keyword; 0.1 / 0.01 are standard deviations.
        torch.nn.init.normal_(mat1, std=0.1)
        torch.nn.init.constant_(mat2, 0.0)
        mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype)
        mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype)
        torch.nn.init.normal_(mat3, std=0.1)
        torch.nn.init.normal_(mat4, std=0.01)
        return LohaDiff(
            (mat1, mat2, alpha, mat3, mat4, None, None, None)
        )
    def to_train(self):
        """Wrap this adapter's stored weights in a trainable LohaDiff module."""
        trainable = LohaDiff(self.weights)
        return trainable
    @classmethod
    def load(
        cls,
--- a/comfy/weight_adapter/lokr.py
+++ b/comfy/weight_adapter/lokr.py
@ -3,7 +3,77 @@ from typing import Optional
 import torch
 import comfy.model_management
-from .base import WeightAdapterBase, weight_decompose
+from .base import (
    WeightAdapterBase,
    WeightAdapterTrainBase,
    weight_decompose,
    factorization,
 )
 class LokrDiff(WeightAdapterTrainBase):
    """
    Trainable LoKr adapter: adds the Kronecker product of two factors, w1 and
    w2, to the original weight. Each factor is either stored in full or
    rebuilt from a low-rank pair (plus an optional Tucker core for w2).
    """
    def __init__(self, weights):
        """
        weights: 9-tuple (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b,
        lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale). Entries that are None are
        skipped; dora_scale is unpacked but not used here.
        """
        super().__init__()
        (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) = weights
        self.use_tucker = False
        if lokr_w1_a is not None:
            # The second assignment wins: rank_a ends up as lokr_w1_b.shape[0].
            _, rank_a = lokr_w1_a.shape[0], lokr_w1_a.shape[1]
            rank_a, _ = lokr_w1_b.shape[0], lokr_w1_b.shape[1]
            self.lokr_w1_a = torch.nn.Parameter(lokr_w1_a)
            self.lokr_w1_b = torch.nn.Parameter(lokr_w1_b)
            self.w1_rebuild = True
            self.ranka = rank_a
        if lokr_w2_a is not None:
            # Same pattern: rank_b ends up as lokr_w2_b.shape[0].
            _, rank_b = lokr_w2_a.shape[0], lokr_w2_a.shape[1]
            rank_b, _ = lokr_w2_b.shape[0], lokr_w2_b.shape[1]
            self.lokr_w2_a = torch.nn.Parameter(lokr_w2_a)
            self.lokr_w2_b = torch.nn.Parameter(lokr_w2_b)
            if lokr_t2 is not None:
                self.use_tucker = True
                self.lokr_t2 = torch.nn.Parameter(lokr_t2)
            self.w2_rebuild = True
            self.rankb = rank_b
        # A full factor, when present, overrides the rebuild flag set above.
        # NOTE(review): if neither the full factor nor its low-rank pair is
        # given, the matching w*_rebuild attribute is never set.
        if lokr_w1 is not None:
            self.lokr_w1 = torch.nn.Parameter(lokr_w1)
            self.w1_rebuild = False
        if lokr_w2 is not None:
            self.lokr_w2 = torch.nn.Parameter(lokr_w2)
            self.w2_rebuild = False
        # alpha is stored as a frozen parameter.
        self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False)
    @property
    def w1(self):
        """First Kronecker factor: rebuilt and scaled by alpha / ranka, or the full tensor as-is."""
        if self.w1_rebuild:
            return (self.lokr_w1_a @ self.lokr_w1_b) * (self.alpha / self.ranka)
        else:
            return self.lokr_w1
    @property
    def w2(self):
        """Second Kronecker factor: rebuilt and scaled by alpha / rankb, or the full tensor as-is."""
        # NOTE(review): the alpha / rank scale is applied per rebuilt factor,
        # so it is applied twice when both factors are rebuilt and not at all
        # when both are stored in full -- confirm this is intended.
        if self.w2_rebuild:
            if self.use_tucker:
                w2 = torch.einsum(
                    'i j k l, j r, i p -> p r k l',
                    self.lokr_t2,
                    self.lokr_w2_b,
                    self.lokr_w2_a
                )
            else:
                w2 = self.lokr_w2_a @ self.lokr_w2_b
            return w2 * (self.alpha / self.rankb)
        else:
            return self.lokr_w2
    def __call__(self, w):
        """Return `w` plus kron(w1, w2), reshaped to w's shape and cast to match `w`."""
        diff = torch.kron(self.w1, self.w2)
        return w + diff.reshape(w.shape).to(w)
    def passive_memory_usage(self):
        """Return the total size in bytes of all parameters of this module."""
        return sum(param.numel() * param.element_size() for param in self.parameters())
 class LoKrAdapter(WeightAdapterBase):
@ -13,6 +83,20 @@ class LoKrAdapter(WeightAdapterBase):
        self.loaded_keys = loaded_keys
        self.weights = weights
    @classmethod
    def create_train(cls, weight, rank=1, alpha=1.0):
        """
        Create a freshly initialised trainable LoKr adapter for `weight`.

        The output size (first dimension) and the flattened input size are
        each split with `factorization(..., rank)`. The first Kronecker factor
        starts at zero and the second is Kaiming-uniform initialised, so the
        initial weight difference is zero.
        """
        tensor_opts = {"device": weight.device, "dtype": weight.dtype}
        out_small, out_large = factorization(weight.shape[0], rank)
        in_small, in_large = factorization(weight.shape[1:].numel(), rank)
        first = torch.empty(out_small, in_small, **tensor_opts)
        second = torch.empty(out_large, in_large, **tensor_opts)
        torch.nn.init.kaiming_uniform_(second, a=5**0.5)
        torch.nn.init.constant_(first, 0.0)
        packed = (first, second, alpha, None, None, None, None, None, None)
        return LokrDiff(packed)
    @classmethod
    def load(
        cls,
--- a/comfy/weight_adapter/lora.py
+++ b/comfy/weight_adapter/lora.py
@ -3,7 +3,56 @@ from typing import Optional
 import torch
 import comfy.model_management
-from .base import WeightAdapterBase, weight_decompose, pad_tensor_to_shape
+from .base import (
    WeightAdapterBase,
    WeightAdapterTrainBase,
    weight_decompose,
    pad_tensor_to_shape,
    tucker_weight_from_conv,
 )
 class LoraDiff(WeightAdapterTrainBase):
    """
    Trainable LoRA adapter: adds (alpha / rank) * (up @ down) to the original
    weight, or the Tucker reconstruction when a `mid` core is present.
    """
    def __init__(self, weights):
        """
        weights: 6-tuple (mat1, mat2, alpha, mid, dora_scale, reshape).
            mat1 is the up factor (out_dim first), mat2 the down factor
            (rank first), mid an optional (rank, rank, *kernel) core. The last
            two entries are not used here.

        Raises ValueError if `mid` does not have 1 to 3 kernel dimensions.
        """
        super().__init__()
        mat1, mat2, alpha, mid, _dora_scale, _reshape = weights
        out_dim = mat1.shape[0]
        rank = mat2.shape[0]
        if mid is not None:
            spatial_dims = mid.ndim - 2
            if not 1 <= spatial_dims <= 3:
                raise ValueError(
                    f"mid must have 1 to 3 kernel dimensions, got shape {tuple(mid.shape)}"
                )
            # Conv1d/2d/3d correspond to 1/2/3 kernel dimensions.
            layer = (
                torch.nn.Conv1d,
                torch.nn.Conv2d,
                torch.nn.Conv3d
            )[spatial_dims - 1]
            # Up/down are 1x1 convolutions around the core; conv layers need
            # an explicit kernel size.
            in_dim = mat2.shape[1]
            self.lora_up = layer(rank, out_dim, 1, bias=False)
            self.lora_down = layer(in_dim, rank, 1, bias=False)
        else:
            # Without a core the factors are plain matrices; conv-shaped
            # factors are flattened past their first dimension.
            in_dim = mat2.shape[1:].numel()
            self.lora_up = torch.nn.Linear(rank, out_dim, bias=False)
            self.lora_down = torch.nn.Linear(in_dim, rank, bias=False)
        # Reshape so trailing singleton/kernel dimensions match the layer weight.
        self.lora_up.weight.data.copy_(mat1.reshape(self.lora_up.weight.shape))
        self.lora_down.weight.data.copy_(mat2.reshape(self.lora_down.weight.shape))
        if mid is not None:
            # A conv weight is (out_channels, in_channels, *kernel), which is
            # exactly mid's own layout.
            self.lora_mid = layer(
                mid.shape[1], mid.shape[0], tuple(mid.shape[2:]), bias=False
            )
            self.lora_mid.weight.data.copy_(mid)
        else:
            self.lora_mid = None
        self.rank = rank
        # alpha is stored as a frozen parameter.
        self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False)
    def __call__(self, w):
        """Return `w` plus the scaled LoRA difference, in w's original dtype."""
        org_dtype = w.dtype
        if self.lora_mid is None:
            diff = self.lora_up.weight @ self.lora_down.weight
        else:
            diff = tucker_weight_from_conv(
                self.lora_up.weight, self.lora_down.weight, self.lora_mid.weight
            )
        scale = self.alpha / self.rank
        weight = w + scale * diff.reshape(w.shape)
        return weight.to(org_dtype)
    def passive_memory_usage(self):
        """Return the total size in bytes of all parameters of this module."""
        return sum(param.numel() * param.element_size() for param in self.parameters())
 class LoRAAdapter(WeightAdapterBase):
@ -13,6 +62,21 @@ class LoRAAdapter(WeightAdapterBase):
        self.loaded_keys = loaded_keys
        self.weights = weights
    @classmethod
    def create_train(cls, weight, rank=1, alpha=1.0):
        """
        Create a freshly initialised trainable LoRA adapter for `weight`.

        The up factor (out_dim x rank) is Kaiming-uniform initialised and the
        down factor (rank x flattened input size) is zero, so the initial
        weight difference is zero.
        """
        tensor_opts = {"device": weight.device, "dtype": weight.dtype}
        rows = weight.shape[0]
        cols = weight.shape[1:].numel()
        up = torch.empty(rows, rank, **tensor_opts)
        down = torch.empty(rank, cols, **tensor_opts)
        torch.nn.init.kaiming_uniform_(up, a=5**0.5)
        torch.nn.init.constant_(down, 0.0)
        packed = (up, down, alpha, None, None, None)
        return LoraDiff(packed)
    def to_train(self):
        """Wrap this adapter's stored weights in a trainable LoraDiff module."""
        trainable = LoraDiff(self.weights)
        return trainable
    @classmethod
    def load(
        cls,
--- a/comfy/weight_adapter/oft.py
+++ b/comfy/weight_adapter/oft.py
@ -3,7 +3,58 @@ from typing import Optional
 import torch
 import comfy.model_management
-from .base import WeightAdapterBase, weight_decompose
+from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose, factorization
 class OFTDiff(WeightAdapterTrainBase):
    """
    Trainable OFT adapter: multiplies blocks of the original weight by a
    rotation built from skew-symmetric blocks via the Cayley transform
    (I + Q) @ (I - Q)^-1.
    """
    def __init__(self, weights):
        """
        weights: 4-tuple (blocks, rescale, alpha, <ignored>) in the layout used
        by OFTAdapter. `blocks` has shape (block_num, block_size, block_size).
        `alpha` is the norm constraint on Q; None or 0 disables it.
        """
        super().__init__()
        # Unpack the weights tuple from OFTAdapter.
        blocks, rescale, alpha, _ = weights
        if alpha is None:
            # OFTAdapter.calculate_weight treats a missing alpha as 0; do the
            # same here instead of failing in float()/torch.tensor().
            alpha = 0
        # Trainable parameters.
        self.oft_blocks = torch.nn.Parameter(blocks)
        if rescale is not None:
            self.rescale = torch.nn.Parameter(rescale)
            self.rescaled = True
        else:
            self.rescaled = False
        self.block_num, self.block_size, _ = blocks.shape
        self.constraint = float(alpha)
        self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False)
    def __call__(self, w):
        """Return the block-rotated (and optionally rescaled) weight in w's dtype."""
        org_dtype = w.dtype
        identity = torch.eye(self.block_size, device=self.oft_blocks.device)
        # Q = B - B^T is skew-symmetric (Q = -Q^T).
        q = self.oft_blocks - self.oft_blocks.transpose(1, 2)
        normed_q = q
        if self.constraint:
            # Rescale Q when its norm exceeds the constraint.
            q_norm = torch.norm(q) + 1e-8
            if q_norm > self.constraint:
                normed_q = q * self.constraint / q_norm
        # float() avoids dtypes that inverse() does not support.
        r = (identity + normed_q) @ (identity - normed_q).float().inverse()
        # Apply the per-block rotation along the weight's first dimension.
        org_weight = w.to(dtype=r.dtype)
        org_weight = org_weight.unflatten(0, (self.block_num, self.block_size))
        # With zero blocks Q is zero and r is the identity, so the weight is
        # returned unchanged.
        weight = torch.einsum(
            "k n m, k n ... -> k m ...",
            r,
            org_weight,
        ).flatten(0, 1)
        if self.rescaled:
            weight = self.rescale * weight
        return weight.to(org_dtype)
    def passive_memory_usage(self):
        """Return the total size in bytes of all parameters of this module."""
        return sum(param.numel() * param.element_size() for param in self.parameters())
 class OFTAdapter(WeightAdapterBase):
@ -13,6 +64,18 @@ class OFTAdapter(WeightAdapterBase):
        self.loaded_keys = loaded_keys
        self.weights = weights
    @classmethod
    def create_train(cls, weight, rank=1, alpha=1.0):
        """
        Create a freshly initialised trainable OFT adapter for `weight`.

        The output size (first dimension of `weight`) is split with
        `factorization(out_dim, rank)`; the smaller factor is the block size
        and the larger one the number of blocks. All blocks start at zero.
        """
        block_size, block_num = factorization(weight.shape[0], rank)
        zero_blocks = torch.zeros(
            block_num,
            block_size,
            block_size,
            device=weight.device,
            dtype=weight.dtype,
        )
        packed = (zero_blocks, None, alpha, None)
        return OFTDiff(packed)
    def to_train(self):
        """Wrap this adapter's stored weights in a trainable OFTDiff module."""
        trainable = OFTDiff(self.weights)
        return trainable
    @classmethod
    def load(
        cls,
@ -60,6 +123,8 @@ class OFTAdapter(WeightAdapterBase):
        blocks = v[0]
        rescale = v[1]
        alpha = v[2]
        if alpha is None:
            alpha = 0
        dora_scale = v[3]
        blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype)
--- a/comfy_api/feature_flags.py
+++ b/comfy_api/feature_flags.py
@ -0,0 +1,69 @@
 """
 Feature flags module for ComfyUI WebSocket protocol negotiation.
 This module handles capability negotiation between frontend and backend,
 allowing graceful protocol evolution while maintaining backward compatibility.
 """
 from typing import Any, Dict
 from comfy.cli_args import args
 # Default server capabilities
 # Capabilities advertised by the server. `max_upload_size` is computed once,
 # at import time, from the `max_upload_size` CLI argument.
 SERVER_FEATURE_FLAGS: Dict[str, Any] = {
    "supports_preview_metadata": True,
    "max_upload_size": args.max_upload_size * 1024 * 1024, # MB converted to bytes
 }
 def get_connection_feature(
    sockets_metadata: Dict[str, Dict[str, Any]],
    sid: str,
    feature_name: str,
    default: Any = False
 ) -> Any:
    """
    Look up one feature flag for a single connection.

    Args:
        sockets_metadata: Per-connection metadata, keyed by session ID
        sid: Session ID of the connection to inspect
        feature_name: Name of the feature flag to read
        default: Value returned when the session or the flag is unknown

    Returns:
        The stored flag value, or `default` if the session has no entry or
        its "feature_flags" mapping lacks `feature_name`
    """
    if sid in sockets_metadata:
        connection_flags = sockets_metadata[sid].get("feature_flags", {})
        return connection_flags.get(feature_name, default)
    return default
 def supports_feature(
    sockets_metadata: Dict[str, Dict[str, Any]],
    sid: str,
    feature_name: str
 ) -> bool:
    """
    Tell whether a connection has explicitly enabled a feature.

    Args:
        sockets_metadata: Per-connection metadata, keyed by session ID
        sid: Session ID of the connection to inspect
        feature_name: Name of the feature flag to read

    Returns:
        True only when the stored flag is exactly `True`; any other value,
        or a missing session or flag, gives False
    """
    flag = get_connection_feature(sockets_metadata, sid, feature_name, False)
    return flag is True
 def get_server_features() -> Dict[str, Any]:
    """
    Return the server's feature flags.

    Returns:
        A shallow copy of SERVER_FEATURE_FLAGS, so callers cannot add or
        remove keys on the module-level dictionary
    """
    return dict(SERVER_FEATURE_FLAGS)
--- a/comfy_api/generate_api_stubs.py
+++ b/comfy_api/generate_api_stubs.py
@ -0,0 +1,86 @@
 #!/usr/bin/env python3
 """
 Script to generate .pyi stub files for the synchronous API wrappers.
 This allows generating stubs without running the full ComfyUI application.
 """
 import os
 import sys
 import logging
 import importlib
 # Add ComfyUI to path so we can import modules
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from comfy_api.internal.async_to_sync import AsyncToSyncConverter
 from comfy_api.version_list import supported_versions
 def generate_stubs_for_module(module_name: str) -> None:
    """
    Generate the stub file for one module exporting ComfyAPI and/or ComfyAPISync.

    If the module only exports ComfyAPI, a sync wrapper is created first.
    Modules exporting neither, or ComfyAPISync without ComfyAPI, are skipped
    with a warning. Any exception is logged with its traceback and swallowed
    so one bad module does not stop the run.
    """
    try:
        module = importlib.import_module(module_name)
        has_sync = hasattr(module, "ComfyAPISync")
        if not has_sync and not hasattr(module, "ComfyAPI"):
            logging.warning(
                f"Module {module_name} does not export ComfyAPI or ComfyAPISync"
            )
            return
        if has_sync:
            # The module ships its own sync class; it still needs the async one.
            api_class = getattr(module, "ComfyAPI", None)
            sync_class = getattr(module, "ComfyAPISync")
            if not api_class:
                logging.warning(
                    f"Module {module_name} has ComfyAPISync but no ComfyAPI"
                )
                return
        else:
            # Only the async API exists: build the sync wrapper on the fly.
            from comfy_api.internal.async_to_sync import create_sync_class
            api_class = getattr(module, "ComfyAPI")
            sync_class = create_sync_class(api_class)
        AsyncToSyncConverter.generate_stub_file(api_class, sync_class)
        logging.info(f"Generated stub file for {module_name}")
    except Exception as e:
        logging.error(f"Failed to generate stub for {module_name}: {e}")
        import traceback
        traceback.print_exc()
 def main():
    """Generate stub files for every module that defines a supported API version."""
    logging.basicConfig(level=logging.INFO)
    logging.info("Starting stub generation...")
    # Collect each API class's module once, keeping first-seen order.
    api_modules = list(
        dict.fromkeys(api_class.__module__ for api_class in supported_versions)
    )
    logging.info(f"Found {len(api_modules)} API modules: {api_modules}")
    for module_name in api_modules:
        generate_stubs_for_module(module_name)
    logging.info("Stub generation complete!")
 if __name__ == "__main__":
    main()
--- a/comfy_api/input/init.py
+++ b/comfy_api/input/init.py
@ -1,8 +1,16 @@
-from .basic_types import ImageInput, AudioInput
+# This file only exists for backwards compatibility.
-from .video_types import VideoInput
+from comfy_api.latest._input import (
    ImageInput,
    AudioInput,
    MaskInput,
    LatentInput,
    VideoInput,
 )
 __all__ = [
    "ImageInput",
    "AudioInput",
    "MaskInput",
    "LatentInput",
    "VideoInput",
 ]
--- a/comfy_api/input/basic_types.py
+++ b/comfy_api/input/basic_types.py
@ -1,20 +1,14 @@
-import torch
+# This file only exists for backwards compatibility.
-from typing import TypedDict
+from comfy_api.latest._input.basic_types import (
-
+    ImageInput,
-ImageInput = torch.Tensor
+    AudioInput,
-"""
+    MaskInput,
-An image in format [B, H, W, C] where B is the batch size, C is the number of channels,
+    LatentInput,
-"""
+)
 class AudioInput(TypedDict):
    """
    TypedDict representing audio input: a waveform tensor plus its sample rate.
    """
    # Audio samples; T in the layout below is presumably the number of
    # samples per channel -- TODO confirm.
    waveform: torch.Tensor
    """
    Tensor in the format [B, C, T] where B is the batch size, C is the number of channels,
    """
    # Sample rate of `waveform`; presumably in Hz -- TODO confirm.
    sample_rate: int
 __all__ = [
    "ImageInput",
    "AudioInput",
    "MaskInput",
    "LatentInput",
 ]
--- a/comfy_api/input/video_types.py
+++ b/comfy_api/input/video_types.py
@ -1,55 +1,6 @@
-from __future__ import annotations
+# This file only exists for backwards compatibility.
-from abc import ABC, abstractmethod
+from comfy_api.latest._input.video_types import VideoInput
 from typing import Optional
 from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
-class VideoInput(ABC):
+__all__ = [
-    """
+    "VideoInput",
-    Abstract base class for video input types.
+]
    """
    @abstractmethod
    def get_components(self) -> VideoComponents:
        """
        Return the video's components (images, audio, and frame rate).

        Abstract: every subclass must provide an implementation.

        Returns:
            VideoComponents containing images, audio, and frame rate
        """
    @abstractmethod
    def save_to(
        self,
        path: str,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        """
        Save the video input to a file.

        Abstract: every subclass must provide an implementation.
        """
    # Provide a default implementation, but subclasses can provide optimized versions
    # if possible.
    def get_dimensions(self) -> tuple[int, int]:
        """
        Return the video's dimensions, read from the decoded image tensor.

        Returns:
            Tuple of (width, height), taken from axes 2 and 1 of
            `get_components().images`
        """
        image_shape = self.get_components().images.shape
        width, height = image_shape[2], image_shape[1]
        return width, height
    def get_duration(self) -> float:
        """
        Return the video's duration in seconds.

        Computed as the number of frames (first axis of the image tensor)
        divided by the frame rate.

        Returns:
            Duration in seconds
        """
        video = self.get_components()
        num_frames = video.images.shape[0]
        seconds = num_frames / video.frame_rate
        return float(seconds)
--- a/comfy_api/input_impl/init.py
+++ b/comfy_api/input_impl/init.py
@ -1,7 +1,7 @@
-from .video_types import VideoFromFile, VideoFromComponents
+# This file only exists for backwards compatibility.
 from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents
 __all__ = [
    # Implementations
    "VideoFromFile",
    "VideoFromComponents",
 ]
--- a/comfy_api/input_impl/video_types.py
+++ b/comfy_api/input_impl/video_types.py
@ -1,303 +1,2 @@
-from __future__ import annotations
+# This file only exists for backwards compatibility.
-from av.container import InputContainer
+from comfy_api.latest._input_impl.video_types import *  # noqa: F403
 from av.subtitles.stream import SubtitleStream
 from fractions import Fraction
 from typing import Optional
 from comfy_api.input import AudioInput
 import av
 import io
 import json
 import numpy as np
 import torch
 from comfy_api.input import VideoInput
 from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
 def container_to_output_format(container_format: str | None) -> str | None:
    """
    A container's `format` may be a comma-separated list of formats.
    E.g., iso container's `format` may be `mov,mp4,m4a,3gp,3g2,mj2`.
    However, writing to a file/stream with `av.open` requires a single format,
    or `None` to auto-detect.
    """
    if not container_format:
        return None  # Auto-detect
    if "," not in container_format:
        return container_format
    formats = container_format.split(",")
    return formats[0]
 def get_open_write_kwargs(
    dest: str | io.BytesIO, container_format: str, to_format: str | None
 ) -> dict:
    """Get kwargs for writing a `VideoFromFile` to a file/stream with `av.open`"""
    open_kwargs = {
        "mode": "w",
        # If isobmff, preserve custom metadata tags (workflow, prompt, extra_pnginfo)
        "options": {"movflags": "use_metadata_tags"},
    }
    is_write_to_buffer = isinstance(dest, io.BytesIO)
    if is_write_to_buffer:
        # Set output format explicitly, since it cannot be inferred from file extension
        if to_format == VideoContainer.AUTO:
            to_format = container_format.lower()
        elif isinstance(to_format, str):
            to_format = to_format.lower()
        open_kwargs["format"] = container_to_output_format(to_format)
    return open_kwargs
 class VideoFromFile(VideoInput):
    """
    Class representing video input from a file.
    """
    def __init__(self, file: str | io.BytesIO):
        """
        Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object
        containing the file contents.
        """
        self.__file = file
    def get_dimensions(self) -> tuple[int, int]:
        """
        Returns the dimensions of the video input.
        Returns:
            Tuple of (width, height)
        """
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        with av.open(self.__file, mode='r') as container:
            for stream in container.streams:
                if stream.type == 'video':
                    assert isinstance(stream, av.VideoStream)
                    return stream.width, stream.height
        raise ValueError(f"No video stream found in file '{self.__file}'")
    def get_duration(self) -> float:
        """
        Returns the duration of the video in seconds.
        Returns:
            Duration in seconds
        """
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)
        with av.open(self.__file, mode="r") as container:
            if container.duration is not None:
                return float(container.duration / av.time_base)
            # Fallback: calculate from frame count and frame rate
            video_stream = next(
                (s for s in container.streams if s.type == "video"), None
            )
            if video_stream and video_stream.frames and video_stream.average_rate:
                return float(video_stream.frames / video_stream.average_rate)
            # Last resort: decode frames to count them
            if video_stream and video_stream.average_rate:
                frame_count = 0
                container.seek(0)
                for packet in container.demux(video_stream):
                    for _ in packet.decode():
                        frame_count += 1
                if frame_count > 0:
                    return float(frame_count / video_stream.average_rate)
        raise ValueError(f"Could not determine duration for file '{self.__file}'")
    def get_components_internal(self, container: InputContainer) -> VideoComponents:
        # Get video frames
        frames = []
        for frame in container.decode(video=0):
            img = frame.to_ndarray(format='rgb24')  # shape: (H, W, 3)
            img = torch.from_numpy(img) / 255.0  # shape: (H, W, 3)
            frames.append(img)
        images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)
        # Get frame rate
        video_stream = next(s for s in container.streams if s.type == 'video')
        frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1)
        # Get audio if available
        audio = None
        try:
            container.seek(0)  # Reset the container to the beginning
            for stream in container.streams:
                if stream.type != 'audio':
                    continue
                assert isinstance(stream, av.AudioStream)
                audio_frames = []
                for packet in container.demux(stream):
                    for frame in packet.decode():
                        assert isinstance(frame, av.AudioFrame)
                        audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
                if len(audio_frames) > 0:
                    audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
                    audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
                    audio = AudioInput({
                        "waveform": audio_tensor,
                        "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1,
                    })
        except StopIteration:
            pass  # No audio stream
        metadata = container.metadata
        return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
    def get_components(self) -> VideoComponents:
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        with av.open(self.__file, mode='r') as container:
            return self.get_components_internal(container)
        raise ValueError(f"No video stream found in file '{self.__file}'")
    def save_to(
        self,
        path: str | io.BytesIO,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        """
        Write this video to `path`.

        When the requested `format` and `codec` are AUTO or match the source container,
        the source packets are copied into the output without re-encoding. Otherwise the
        source is decoded via `get_components_internal` and saved through
        `VideoFromComponents.save_to`.

        Args:
            path: Destination passed to `av.open` (and to `get_open_write_kwargs`).
            format: Requested container; AUTO keeps the source container.
            codec: Requested video codec; AUTO keeps the source codec.
            metadata: Extra container metadata. Keys given here replace source
                metadata with the same key; non-string values are JSON-encoded.
        """
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        with av.open(self.__file, mode='r') as container:
            # container.format.name may list several comma-separated names, hence the split below
            container_format = container.format.name
            video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None
            reuse_streams = True
            if format != VideoContainer.AUTO and format not in container_format.split(","):
                reuse_streams = False
            # A codec mismatch only forces re-encoding when the source actually has a video stream
            if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
                reuse_streams = False
            if not reuse_streams:
                # Decode everything and let VideoFromComponents do the encoding
                components = self.get_components_internal(container)
                video = VideoFromComponents(components)
                return video.save_to(
                    path,
                    format=format,
                    codec=codec,
                    metadata=metadata
                )
            streams = container.streams
            open_kwargs = get_open_write_kwargs(path, container_format, format)
            with av.open(path, **open_kwargs) as output_container:
                # Copy over the original metadata, except keys the caller is overriding
                for key, value in container.metadata.items():
                    if metadata is None or key not in metadata:
                        output_container.metadata[key] = value
                # Add our new metadata (strings as-is, everything else JSON-encoded)
                if metadata is not None:
                    for key, value in metadata.items():
                        if isinstance(value, str):
                            output_container.metadata[key] = value
                        else:
                            output_container.metadata[key] = json.dumps(value)
                # Add streams to the new container, remembering input -> output stream
                stream_map = {}
                for stream in streams:
                    if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)):
                        out_stream = output_container.add_stream_from_template(template=stream, opaque=True)
                        stream_map[stream] = out_stream
                # Write packets to the new container; packets of unmapped streams or
                # without a dts are skipped
                for packet in container.demux():
                    if packet.stream in stream_map and packet.dts is not None:
                        packet.stream = stream_map[packet.stream]
                        output_container.mux(packet)
 class VideoFromComponents(VideoInput):
    """
    Class representing video input from tensors.
    Wraps a VideoComponents bundle (image frames, optional audio, frame rate) and can
    encode it to an H.264 video with an optional mono AAC audio track.
    """
    def __init__(self, components: VideoComponents):
        self.__components = components
    def get_components(self) -> VideoComponents:
        """Return a new VideoComponents built from the stored images, audio and frame rate."""
        return VideoComponents(
            images=self.__components.images,
            audio=self.__components.audio,
            frame_rate=self.__components.frame_rate
        )
    def save_to(
        self,
        path: str,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        """
        Encode the stored components and write them to `path`.

        Args:
            path: Destination handed to `av.open(..., mode='w')`.
            format: Must be AUTO or MP4.
            codec: Must be AUTO or H264.
            metadata: Optional container metadata; every value is JSON-encoded.

        Raises:
            ValueError: If an unsupported format or codec is requested.
        """
        if format != VideoContainer.AUTO and format != VideoContainer.MP4:
            raise ValueError("Only MP4 format is supported for now")
        if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
            raise ValueError("Only H264 codec is supported for now")
        with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output:
            # Add metadata before writing any streams
            if metadata is not None:
                for key, value in metadata.items():
                    output.metadata[key] = json.dumps(value)
            # Express the frame rate as a rational with millisecond precision
            frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000)
            # Create a video stream; images are indexed (frame, height, width, channel)
            video_stream = output.add_stream('h264', rate=frame_rate)
            video_stream.width = self.__components.images.shape[2]
            video_stream.height = self.__components.images.shape[1]
            video_stream.pix_fmt = 'yuv420p'
            # Create an audio stream
            audio_sample_rate = 1
            audio_stream: Optional[av.AudioStream] = None
            if self.__components.audio:
                audio_sample_rate = int(self.__components.audio['sample_rate'])
                audio_stream = output.add_stream('aac', rate=audio_sample_rate)
                audio_stream.sample_rate = audio_sample_rate
                audio_stream.format = 'fltp'
            # Encode video
            for image in self.__components.images:
                img = (image * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3)
                video_frame = av.VideoFrame.from_ndarray(img, format='rgb24')
                video_frame = video_frame.reformat(format='yuv420p')  # Convert to YUV420P as required by h264
                packet = video_stream.encode(video_frame)
                output.mux(packet)
            # Flush video
            packet = video_stream.encode(None)
            output.mux(packet)
            if audio_stream and self.__components.audio:
                # Encode audio in chunks of roughly one video frame each.
                # max(1, ...) avoids a zero chunk size (and the resulting division by
                # zero) when the sample rate is lower than the frame rate.
                samples_per_frame = max(1, int(audio_sample_rate / frame_rate))
                waveform = self.__components.audio['waveform']
                total_samples = waveform.shape[2]
                # Stepping by chunk size also covers the trailing partial chunk,
                # so the last samples of the waveform are not silently dropped
                for start in range(0, total_samples, samples_per_frame):
                    end = min(start + samples_per_frame, total_samples)
                    # TODO(Feature) - Add support for stereo audio
                    chunk = (
                        waveform[0, 0, start:end]
                        .unsqueeze(0)
                        .contiguous()
                        .cpu()  # no-op for CPU tensors; required before .numpy() otherwise
                        .numpy()
                    )
                    audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono')
                    audio_frame.sample_rate = audio_sample_rate
                    audio_frame.pts = start
                    for packet in audio_stream.encode(audio_frame):
                        output.mux(packet)
                # Flush audio
                for packet in audio_stream.encode(None):
                    output.mux(packet)
--- a/comfy_api/internal/init.py
+++ b/comfy_api/internal/init.py
@ -0,0 +1,150 @@
 # Internal infrastructure for ComfyAPI
 from .api_registry import (
    ComfyAPIBase as ComfyAPIBase,
    ComfyAPIWithVersion as ComfyAPIWithVersion,
    register_versions as register_versions,
    get_all_versions as get_all_versions,
 )
 import asyncio
 from dataclasses import asdict
 from typing import Callable, Optional
 def first_real_override(cls: type, name: str, *, base: Optional[type] = None) -> Optional[Callable]:
    """Return the *callable* override of `name` visible on `cls`, or None if every
    implementation up to (and including) `base` is the placeholder defined on `base`.
    If base is not provided, it will assume cls has a GET_BASE_CLASS

    Works for classmethods as well as plain functions and staticmethods.

    Raises:
        ValueError: If `base` is omitted and `cls` has no GET_BASE_CLASS.
    """
    if base is None:
        if not hasattr(cls, "GET_BASE_CLASS"):
            raise ValueError("base is required if cls does not have a GET_BASE_CLASS; is this a valid ComfyNode subclass?")
        base = cls.GET_BASE_CLASS()
    base_attr = getattr(base, name, None)
    if base_attr is None:
        return None
    # Bound/class methods expose the underlying function as __func__; plain
    # functions and staticmethods have no __func__ and are compared directly
    base_func = getattr(base_attr, "__func__", base_attr)
    for c in cls.mro():                       # NodeB, NodeA, ComfyNode, object …
        if c is base:                         # reached the placeholder – we're done
            break
        if name in c.__dict__:                # first class that *defines* the attr
            candidate = getattr(c, name)
            func = getattr(candidate, "__func__", candidate)
            if func is not base_func:         # real override
                return getattr(cls, name)     # bound to *cls*
    return None
 class _ComfyNodeInternal:
    """Class that all V3-based APIs inherit from for ComfyNode.
    This is intended to only be referenced within execution.py, as it has to handle all V3 APIs going forward."""
    @classmethod
    def GET_NODE_INFO_V1(cls):
        # Placeholder with no implementation here (returns None as written).
        # NOTE(review): presumably overridden by the concrete ComfyNode bases — confirm.
        ...
 class _NodeOutputInternal:
    """Class that all V3-based APIs inherit from for NodeOutput.
    This is intended to only be referenced within execution.py, as it has to handle all V3 APIs going forward."""
    # Marker base class: defines no attributes or methods of its own
    ...
 def as_pruned_dict(dataclass_obj):
    """Convert a dataclass instance to a dict and drop top-level entries whose value is None."""
    as_dict = asdict(dataclass_obj)
    return prune_dict(as_dict)
 def prune_dict(d: dict):
    """Return a new dict with the items of `d` whose value is not None (top level only)."""
    kept = {}
    for key, value in d.items():
        if value is None:
            continue
        kept[key] = value
    return kept
 def is_class(obj):
    '''
    Tell classes apart from instances.
    Returns True when `obj` is itself a class (an instance of `type`), False otherwise.
    '''
    if isinstance(obj, type):
        return True
    return False
 def copy_class(cls: type) -> type:
    '''
    Build a same-named subclass of `cls` that re-declares the attributes found in
    `cls.__dict__`. Returns None when `cls` is None.
    '''
    if cls is None:
        return None
    # These entries are either per-class descriptors or restored explicitly below
    skipped = ('__dict__', '__weakref__', '__module__', '__doc__')
    namespace = {}
    for attr_name, attr_value in cls.__dict__.items():
        if attr_name not in skipped:
            namespace[attr_name] = attr_value
    duplicate = type(cls.__name__, (cls,), namespace)
    # Carry over the metadata that was left out of the namespace
    duplicate.__module__ = cls.__module__
    duplicate.__doc__ = cls.__doc__
    return duplicate
 class classproperty(object):
    """Descriptor that evaluates the wrapped function with the owning class on every access."""
    def __init__(self, f):
        self.f = f
    def __get__(self, obj, owner):
        # The instance (`obj`) is ignored; only the owner class is passed on
        getter = self.f
        return getter(owner)
 # NOTE: this was ai generated and validated by hand
 def shallow_clone_class(cls, new_name=None):
    '''
    Create a class named `new_name` (default: "<OriginalName>Clone") whose namespace is a
    shallow copy of `cls.__dict__`. The original class stays in the bases so that
    super() keeps working.
    '''
    if not new_name:
        new_name = f"{cls.__name__}Clone"
    namespace = dict(cls.__dict__)
    # Original class first, followed by its own bases
    bases = (cls, *cls.__bases__)
    return type(new_name, bases, namespace)
 # NOTE: this was ai generated and validated by hand
 def lock_class(cls):
    '''
    Lock a class so that its top-level attributes cannot be modified.
    Returns a rebuilt class (same name, bases and namespace) on which attribute
    assignment raises AttributeError, both on the class and on its instances.
    '''
    # Metaclass that rejects attribute assignment on the class object
    class LockedMeta(type(cls)):
        def __setattr__(cls_, name, value):
            raise AttributeError(
                f"Cannot modify class attribute '{name}' on locked class '{cls_.__name__}'"
            )
    # Replacement __setattr__ that rejects attribute assignment on instances
    def locked_instance_setattr(self, name, value):
        raise AttributeError(
            f"Cannot set attribute '{name}' on immutable instance of {type(self).__name__}"
        )
    # Rebuild the class from a copy of its namespace with the locked __setattr__
    namespace = {**cls.__dict__, '__setattr__': locked_instance_setattr}
    return LockedMeta(cls.__name__, cls.__bases__, namespace)
 def make_locked_method_func(type_obj, func, class_clone):
    """
    Build a keyword-only callable that runs the method named `func` of `type_obj`
    against a locked copy of `class_clone`, i.e.:
    getattr(type_obj, func).__func__(lock_class(class_clone), **inputs)
    The returned wrapper is a coroutine function when the method is async, and a
    regular function otherwise.
    """
    locked_class = lock_class(class_clone)
    method = getattr(type_obj, func).__func__
    if not asyncio.iscoroutinefunction(method):
        # Synchronous method: call straight through
        def wrapped_func(**inputs):
            return method(locked_class, **inputs)
        return wrapped_func
    # Async method: the wrapper must be awaited by the caller
    async def wrapped_async_func(**inputs):
        outcome = await method(locked_class, **inputs)
        return outcome
    return wrapped_async_func
--- a/comfy_api/internal/api_registry.py
+++ b/comfy_api/internal/api_registry.py
@ -0,0 +1,39 @@
 from typing import Type, List, NamedTuple
 from comfy_api.internal.singleton import ProxiedSingleton
 from packaging import version as packaging_version
 class ComfyAPIBase(ProxiedSingleton):
    """Base class for the versioned ComfyAPI classes (the type used for `ComfyAPIWithVersion.api_class`)."""
    def __init__(self):
        # Intentionally empty; note that ProxiedSingleton.__init__ is not called from here
        pass
 class ComfyAPIWithVersion(NamedTuple):
    """Pairs a version label with the ComfyAPI class registered for it."""
    # Version label; `parse_version` accepts anything packaging can parse, or the literal "latest"
    version: str
    # ComfyAPIBase subclass registered under that version
    api_class: Type[ComfyAPIBase]
 def parse_version(version_str: str) -> packaging_version.Version:
    """
    Turn a version string into a comparable packaging_version.Version.
    The special label "latest" is mapped to a very large version number.
    Raises ValueError if the version string is invalid.
    """
    # Substitute the sentinel for "latest", then parse through a single code path
    normalized = "9999999.9999999.9999999" if version_str == "latest" else version_str
    return packaging_version.parse(normalized)
 # Registered API versions, kept sorted from oldest to newest ("latest" sorts last)
 registered_versions: List[ComfyAPIWithVersion] = []
 def register_versions(versions: List[ComfyAPIWithVersion]):
    """
    Replace the registered ComfyAPI versions with `versions`, sorted by parsed version.
    The caller's list is left untouched; a new sorted list is stored instead.
    Raises ValueError if any version string cannot be parsed.
    """
    global registered_versions
    # sorted() builds a fresh list, so the module state never aliases (or reorders)
    # the list object owned by the caller
    registered_versions = sorted(versions, key=lambda x: parse_version(x.version))
 def get_all_versions() -> List[ComfyAPIWithVersion]:
    """
    Returns a list of all registered ComfyAPI versions.
    This is the module-level `registered_versions` list itself, not a copy, so
    mutating the result mutates the registry.
    """
    return registered_versions
--- a/comfy_api/internal/async_to_sync.py
+++ b/comfy_api/internal/async_to_sync.py
@ -0,0 +1,987 @@
 import asyncio
 import concurrent.futures
 import contextvars
 import functools
 import inspect
 import logging
 import os
 import textwrap
 import threading
 from enum import Enum
 from typing import Optional, Type, get_origin, get_args
 class TypeTracker:
    """Records the module and name of types met during stub generation so imports can be emitted for them."""
    def __init__(self):
        # type_name -> (module, qualname)
        self.discovered_types = {}
        # Names that never need an import line of their own
        self.builtin_types = {
            "Any",
            "Dict",
            "List",
            "Optional",
            "Tuple",
            "Union",
            "Set",
            "Sequence",
            "cast",
            "NamedTuple",
            "str",
            "int",
            "float",
            "bool",
            "None",
            "bytes",
            "object",
            "type",
            "dict",
            "list",
            "tuple",
            "set",
        }
        # Types already imported elsewhere; tracked to avoid duplicates
        self.already_imported = set()
    def track_type(self, annotation):
        """Remember where `annotation` comes from, unless it needs no import."""
        if annotation is None or annotation is type(None):
            return
        type_name = getattr(annotation, "__name__", None)
        # Builtins, typing names and already-imported types are skipped outright
        if type_name:
            if type_name in self.builtin_types or type_name in self.already_imported:
                return
        module = getattr(annotation, "__module__", None)
        qualname = getattr(annotation, "__qualname__", type_name or "")
        # Anything from the typing module is already imported
        if module == "typing":
            return
        # UnionType / GenericAlias from the types module are handled specially elsewhere
        if module == "types" and type_name in ("UnionType", "GenericAlias"):
            return
        if not module or module in ["builtins", "__main__"]:
            return
        if type_name:
            self.discovered_types[type_name] = (module, qualname)
    def get_imports(self, main_module_name: str) -> list[str]:
        """Return one `from <module> import <names>` line per module, sorted by module and name."""
        names_by_module = {}
        # Keys are unique, so walking the sorted names yields per-module lists that
        # are already sorted and free of duplicates
        for type_name in sorted(self.discovered_types):
            module, _qualname = self.discovered_types[type_name]
            # Types from the main module are already available there
            if main_module_name and module == main_module_name:
                continue
            names_by_module.setdefault(module, []).append(type_name)
        return [
            f"from {module} import {', '.join(names_by_module[module])}"
            for module in sorted(names_by_module)
        ]
 class AsyncToSyncConverter:
    """
    Provides utilities to convert async classes to sync classes with proper type hints.
    """
    _thread_pool: Optional[concurrent.futures.ThreadPoolExecutor] = None
    _thread_pool_lock = threading.Lock()
    _thread_pool_initialized = False
    @classmethod
    def get_thread_pool(cls, max_workers=None) -> concurrent.futures.ThreadPoolExecutor:
        """Get or create the shared thread pool with proper thread-safe initialization."""
        # Fast path - check if already initialized without acquiring lock
        if cls._thread_pool_initialized:
            assert cls._thread_pool is not None, "Thread pool should be initialized"
            return cls._thread_pool
        # Slow path - acquire lock and create pool if needed
        with cls._thread_pool_lock:
            if not cls._thread_pool_initialized:
                cls._thread_pool = concurrent.futures.ThreadPoolExecutor(
                    max_workers=max_workers, thread_name_prefix="async_to_sync_"
                )
                cls._thread_pool_initialized = True
        # This should never be None at this point, but add assertion for type checker
        assert cls._thread_pool is not None
        return cls._thread_pool
    @classmethod
    def run_async_in_thread(cls, coro_func, *args, **kwargs):
        """
        Run an async function in a separate thread from the thread pool.
        Blocks until the async function completes.
        Properly propagates contextvars between threads and manages event loops.

        Args:
            coro_func: Coroutine function to call as `coro_func(*args, **kwargs)`.
            *args, **kwargs: Forwarded to `coro_func`.

        Returns:
            Whatever the awaited coroutine returns.

        Raises:
            Any exception raised by the coroutine, re-raised in the calling thread.
        """
        # Capture current context - this includes all context variables
        context = contextvars.copy_context()
        # Store the result and any exception that occurs
        result_container: dict = {"result": None, "exception": None}
        # Function that runs in the thread pool
        def run_in_thread():
            # Create new event loop for this thread
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                # Create the coroutine within the context
                async def run_with_context():
                    # The coroutine function might access context variables
                    return await coro_func(*args, **kwargs)
                # Run the coroutine with the captured context
                # This ensures all context variables are available in the async function
                result = context.run(loop.run_until_complete, run_with_context())
                result_container["result"] = result
            except Exception as e:
                # Store the exception to re-raise in the calling thread
                result_container["exception"] = e
            finally:
                # Ensure event loop is properly closed to prevent warnings
                try:
                    # Cancel any remaining tasks
                    pending = asyncio.all_tasks(loop)
                    for task in pending:
                        task.cancel()
                    # Run the loop briefly to handle cancellations
                    if pending:
                        loop.run_until_complete(
                            asyncio.gather(*pending, return_exceptions=True)
                        )
                except Exception:
                    pass  # Ignore errors during cleanup
                # Close the event loop
                loop.close()
                # Clear the event loop from the thread (pool threads are reused)
                asyncio.set_event_loop(None)
        # Submit to thread pool and wait for result
        thread_pool = cls.get_thread_pool()
        future = thread_pool.submit(run_in_thread)
        # Wait for completion; this also re-raises anything run_in_thread itself let escape
        future.result()
        # Re-raise any exception that occurred in the thread
        if result_container["exception"] is not None:
            raise result_container["exception"]
        return result_container["result"]
    @classmethod
    def create_sync_class(cls, async_class: Type, thread_pool_size=10) -> Type:
        """
        Creates a new class with synchronous versions of all async methods.

        The generated class wraps an instance of `async_class` (stored as
        `_async_instance`). Public coroutine methods are run to completion through
        `run_async_in_thread`; other public functions and properties are forwarded.
        Attributes that are ProxiedSingleton instances are wrapped recursively.

        Args:
            async_class: The async class to convert
            thread_pool_size: Size of thread pool to use (only effective if the
                shared pool has not been created yet)
        Returns:
            A new class with sync versions of all async methods
        """
        # Note: every generated class gets this same name, including nested wrappers
        sync_class_name = "ComfyAPISyncStub"
        cls.get_thread_pool(thread_pool_size)
        # Create a proper class with docstrings and proper base classes
        sync_class_dict = {
            "__doc__": async_class.__doc__,
            "__module__": async_class.__module__,
            "__qualname__": sync_class_name,
            "__orig_class__": async_class,  # Store original class for typing references
        }
        # Create __init__ method
        def __init__(self, *args, **kwargs):
            self._async_instance = async_class(*args, **kwargs)
            # Handle annotated class attributes (like execution: Execution)
            # Get all annotations from the class hierarchy
            # (reversed MRO, so subclass annotations override base-class ones)
            all_annotations = {}
            for base_class in reversed(inspect.getmro(async_class)):
                if hasattr(base_class, "__annotations__"):
                    all_annotations.update(base_class.__annotations__)
            # For each annotated attribute, check if it needs to be created or wrapped
            for attr_name, attr_type in all_annotations.items():
                if hasattr(self._async_instance, attr_name):
                    # Attribute exists on the instance
                    attr = getattr(self._async_instance, attr_name)
                    # Check if this attribute needs a sync wrapper
                    if hasattr(attr, "__class__"):
                        from comfy_api.internal.singleton import ProxiedSingleton
                        if isinstance(attr, ProxiedSingleton):
                            # Create a sync version of this attribute
                            try:
                                sync_attr_class = cls.create_sync_class(attr.__class__)
                                # Create instance of the sync wrapper with the async instance
                                # (object.__new__ skips __init__, so no second async instance is built)
                                sync_attr = object.__new__(sync_attr_class)  # type: ignore
                                sync_attr._async_instance = attr
                                setattr(self, attr_name, sync_attr)
                            except Exception:
                                # If we can't create a sync version, keep the original
                                setattr(self, attr_name, attr)
                        else:
                            # Not async, just copy the reference
                            setattr(self, attr_name, attr)
                else:
                    # Attribute doesn't exist, but is annotated - create it
                    # This handles cases like execution: Execution
                    if isinstance(attr_type, type):
                        # Check if the type is defined as an inner class
                        if hasattr(async_class, attr_type.__name__):
                            inner_class = getattr(async_class, attr_type.__name__)
                            from comfy_api.internal.singleton import ProxiedSingleton
                            # Create an instance of the inner class
                            try:
                                # For ProxiedSingleton classes, get or create the singleton instance
                                if issubclass(inner_class, ProxiedSingleton):
                                    async_instance = inner_class.get_instance()
                                else:
                                    async_instance = inner_class()
                                # Create sync wrapper
                                sync_attr_class = cls.create_sync_class(inner_class)
                                sync_attr = object.__new__(sync_attr_class)  # type: ignore
                                sync_attr._async_instance = async_instance
                                setattr(self, attr_name, sync_attr)
                                # Also set on the async instance for consistency
                                setattr(self._async_instance, attr_name, async_instance)
                            except Exception as e:
                                logging.warning(
                                    f"Failed to create instance for {attr_name}: {e}"
                                )
            # Handle other instance attributes that might not be annotated
            for name, attr in inspect.getmembers(self._async_instance):
                # Skip private names and anything the wrapper already provides
                if name.startswith("_") or hasattr(self, name):
                    continue
                # If attribute is an instance of a class, and that class is defined in the original class
                # we need to check if it needs a sync wrapper
                if isinstance(attr, object) and not isinstance(
                    attr, (str, int, float, bool, list, dict, tuple)
                ):
                    from comfy_api.internal.singleton import ProxiedSingleton
                    if isinstance(attr, ProxiedSingleton):
                        # Create a sync version of this nested class
                        try:
                            sync_attr_class = cls.create_sync_class(attr.__class__)
                            # Create instance of the sync wrapper with the async instance
                            sync_attr = object.__new__(sync_attr_class)  # type: ignore
                            sync_attr._async_instance = attr
                            setattr(self, name, sync_attr)
                        except Exception:
                            # If we can't create a sync version, keep the original
                            setattr(self, name, attr)
        sync_class_dict["__init__"] = __init__
        # Process methods from the async class (public plain functions only)
        for name, method in inspect.getmembers(
            async_class, predicate=inspect.isfunction
        ):
            if name.startswith("_"):
                continue
            # Extract the actual return type from a coroutine
            if inspect.iscoroutinefunction(method):
                # Create sync version of async method with proper signature
                # `_method_name=name` binds the current loop value as a default, so each
                # wrapper keeps its own method name instead of the last one in the loop
                @functools.wraps(method)
                def sync_method(self, *args, _method_name=name, **kwargs):
                    async_method = getattr(self._async_instance, _method_name)
                    return AsyncToSyncConverter.run_async_in_thread(
                        async_method, *args, **kwargs
                    )
                # Add to the class dict
                sync_class_dict[name] = sync_method
            else:
                # For regular methods, create a proxy method
                @functools.wraps(method)
                def proxy_method(self, *args, _method_name=name, **kwargs):
                    method = getattr(self._async_instance, _method_name)
                    return method(*args, **kwargs)
                # Add to the class dict
                sync_class_dict[name] = proxy_method
        # Handle property access
        for name, prop in inspect.getmembers(
            async_class, lambda x: isinstance(x, property)
        ):
            # Factory function so each property closes over its own `name`
            def make_property(name, prop_obj):
                def getter(self):
                    value = getattr(self._async_instance, name)
                    # A property that yields a coroutine function is exposed as a blocking callable
                    if inspect.iscoroutinefunction(value):
                        def sync_fn(*args, **kwargs):
                            return AsyncToSyncConverter.run_async_in_thread(
                                value, *args, **kwargs
                            )
                        return sync_fn
                    return value
                def setter(self, value):
                    setattr(self._async_instance, name, value)
                # Only expose a setter when the original property has one
                return property(getter, setter if prop_obj.fset else None)
            sync_class_dict[name] = make_property(name, prop)
        # Create the class
        sync_class = type(sync_class_name, (object,), sync_class_dict)
        return sync_class
    @classmethod
    def _format_type_annotation(
        cls, annotation, type_tracker: Optional[TypeTracker] = None
    ) -> str:
        """Convert a type annotation to its string representation for stub files.

        Args:
            annotation: The annotation object taken from a signature or type hints.
            type_tracker: Optional tracker that is told about every type visited, so
                imports can be generated later.

        Returns:
            The annotation as source text, e.g. "Any" for a missing annotation,
            "None" for NoneType, or "Origin[Arg, ...]" for parameterized generics.
        """
        # A missing annotation is rendered as Any
        if (
            annotation is inspect.Parameter.empty
            or annotation is inspect.Signature.empty
        ):
            return "Any"
        # Handle None type
        if annotation is type(None):
            return "None"
        # Track the type if we have a tracker
        if type_tracker:
            type_tracker.track_type(annotation)
        # Try using typing.get_origin/get_args for Python 3.8+
        try:
            origin = get_origin(annotation)
            args = get_args(annotation)
            if origin is not None:
                # Track the origin type
                if type_tracker:
                    type_tracker.track_type(origin)
                # Get the origin name (falling back to str() and keeping only the last dotted part)
                origin_name = getattr(origin, "__name__", str(origin))
                if "." in origin_name:
                    origin_name = origin_name.split(".")[-1]
                # Special handling for types.UnionType (Python 3.10+ pipe operator)
                # Convert to old-style Union for compatibility
                if str(origin) == "<class 'types.UnionType'>" or origin_name == "UnionType":
                    origin_name = "Union"
                # Format arguments recursively
                if args:
                    formatted_args = []
                    for arg in args:
                        # Track each type in the union
                        if type_tracker:
                            type_tracker.track_type(arg)
                        formatted_args.append(cls._format_type_annotation(arg, type_tracker))
                    return f"{origin_name}[{', '.join(formatted_args)}]"
                else:
                    return origin_name
        except (AttributeError, TypeError):
            # Fallback for older Python versions or non-generic types
            pass
        # Handle generic types the old way for compatibility
        if hasattr(annotation, "__origin__") and hasattr(annotation, "__args__"):
            origin = annotation.__origin__
            origin_name = (
                origin.__name__
                if hasattr(origin, "__name__")
                else str(origin).split("'")[1]
            )
            # Format each type argument
            args = []
            for arg in annotation.__args__:
                args.append(cls._format_type_annotation(arg, type_tracker))
            return f"{origin_name}[{', '.join(args)}]"
        # Handle regular types with __name__ (module prefix is dropped)
        if hasattr(annotation, "__name__"):
            return annotation.__name__
        # Handle special module types (like types from typing module)
        if hasattr(annotation, "__module__") and hasattr(annotation, "__qualname__"):
            # For types like typing.Literal, typing.TypedDict, etc.
            return annotation.__qualname__
        # Last resort: string conversion with cleanup
        type_str = str(annotation)
        # Clean up common patterns more robustly
        if type_str.startswith("<class '") and type_str.endswith("'>"):
            type_str = type_str[8:-2]  # Remove "<class '" and "'>"
        # Remove module prefixes for common modules
        for prefix in ["typing.", "builtins.", "types."]:
            if type_str.startswith(prefix):
                type_str = type_str[len(prefix) :]
        # Handle special cases (note: an "_empty" string form maps to None here, not Any)
        if type_str in ("_empty", "inspect._empty"):
            return "None"
        # Fix NoneType (this should rarely be needed now)
        if type_str == "NoneType":
            return "None"
        return type_str
    @classmethod
    def _extract_coroutine_return_type(cls, annotation):
        """Extract the actual return type from a Coroutine annotation."""
        if hasattr(annotation, "__args__") and len(annotation.__args__) > 2:
            # Coroutine[Any, Any, ReturnType] -> extract ReturnType
            return annotation.__args__[2]
        return annotation
    @classmethod
    def _format_parameter_default(cls, default_value) -> str:
        """Format a parameter's default value for stub files."""
        if default_value is inspect.Parameter.empty:
            return ""
        elif default_value is None:
            return " = None"
        elif isinstance(default_value, bool):
            return f" = {default_value}"
        elif default_value == {}:
            return " = {}"
        elif default_value == []:
            return " = []"
        else:
            return f" = {default_value}"
    @classmethod
    def _format_method_parameters(
        cls,
        sig: inspect.Signature,
        skip_self: bool = True,
        type_hints: Optional[dict] = None,
        type_tracker: Optional[TypeTracker] = None,
    ) -> str:
        """Format method parameters for stub files.

        Args:
            sig: Signature whose parameters are rendered.
            skip_self: When True, a leading ``self`` parameter is emitted bare
                (no annotation, no default).
            type_hints: Evaluated type hints keyed by parameter name; they take
                precedence over the raw annotations stored on ``sig``.
            type_tracker: Passed through to ``_format_type_annotation``.

        Returns:
            The comma-separated parameter list, without surrounding parentheses.
        """
        if type_hints is None:
            type_hints = {}
        kinds = inspect.Parameter
        params = []
        # True once "*args" or a bare "*" has been written; keyword-only
        # parameters need exactly one of them in front.
        star_emitted = False
        for i, (param_name, param) in enumerate(sig.parameters.items()):
            if i == 0 and param_name == "self" and skip_self:
                params.append("self")
                continue
            # Keep the parameter kind: without the stars, *args / **kwargs would
            # be rendered as ordinary positional parameters.
            if param.kind is kinds.VAR_POSITIONAL:
                prefix = "*"
                star_emitted = True
            elif param.kind is kinds.VAR_KEYWORD:
                prefix = "**"
            else:
                prefix = ""
                if param.kind is kinds.KEYWORD_ONLY and not star_emitted:
                    params.append("*")
                    star_emitted = True
            # Get type annotation from type hints if available, otherwise from signature
            annotation = type_hints.get(param_name, param.annotation)
            type_str = cls._format_type_annotation(annotation, type_tracker)
            # Get default value
            default_str = cls._format_parameter_default(param.default)
            # Unannotated parameters are typed as Any
            if annotation is inspect.Parameter.empty:
                type_str = "Any"
            params.append(f"{prefix}{param_name}: {type_str}{default_str}")
        return ", ".join(params)
    @classmethod
    def _generate_method_signature(
        cls,
        method_name: str,
        method,
        is_async: bool = False,
        type_tracker: Optional[TypeTracker] = None,
    ) -> str:
        """Build the one-line ``def name(...) -> T: ...`` stub for a method.

        Evaluated type hints are preferred over raw signature annotations so
        that string annotations are resolved where possible. For coroutine
        functions (when ``is_async`` is set) the return annotation is passed
        through ``_extract_coroutine_return_type`` first. A missing return
        annotation is rendered as ``None``.
        """
        signature = inspect.signature(method)
        # Resolving hints is best-effort; any failure means "no resolved hints"
        try:
            from typing import get_type_hints
            resolved_hints = get_type_hints(method)
        except Exception:
            resolved_hints = {}
        ret_annotation = resolved_hints.get("return", signature.return_annotation)
        unwrap_coroutine = is_async and inspect.iscoroutinefunction(method)
        if unwrap_coroutine:
            ret_annotation = cls._extract_coroutine_return_type(ret_annotation)
        # Parameters are rendered before the return type
        rendered_params = cls._format_method_parameters(
            signature, type_hints=resolved_hints, type_tracker=type_tracker
        )
        rendered_return = cls._format_type_annotation(ret_annotation, type_tracker)
        if ret_annotation is inspect.Signature.empty:
            rendered_return = "None"
        return f"def {method_name}({rendered_params}) -> {rendered_return}: ..."
    @classmethod
    def _generate_imports(
        cls, async_class: Type, type_tracker: TypeTracker
    ) -> list[str]:
        """Generate import statements for the stub file.

        Args:
            async_class: The class the stub is generated for.
            type_tracker: Tracker holding the types discovered while formatting
                annotations. Names imported here are added to its
                ``already_imported`` set.

        Returns:
            Import lines in emission order: the fixed typing import, the import
            of ``async_class`` (plus NamedTuple/Enum classes found in its
            module), the tracker's imports, and finally ``import <base package>``
            when no other line already imports from that package.
        """
        # Add standard typing imports
        imports = [
            "from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple"
        ]
        module = inspect.getmodule(async_class)
        # Add imports from the original module
        if async_class.__module__ != "builtins":
            additional_types = []
            if module:
                # Check if module has __all__ defined
                module_all = getattr(module, "__all__", None)
                for name, obj in sorted(inspect.getmembers(module)):
                    if not isinstance(obj, type):
                        continue
                    # Skip names hidden by __all__ unless they were actually
                    # used in type annotations
                    if (
                        module_all is not None
                        and name not in module_all
                        and name not in type_tracker.discovered_types
                    ):
                        continue
                    is_named_tuple = issubclass(obj, tuple) and hasattr(obj, "_fields")
                    is_enum = issubclass(obj, Enum) and name != "Enum"
                    if is_named_tuple or is_enum:
                        additional_types.append(name)
                        # Mark as already imported
                        type_tracker.already_imported.add(name)
            type_imports = ", ".join([async_class.__name__] + additional_types)
            imports.append(f"from {async_class.__module__} import {type_imports}")
        # Add imports for all discovered types
        # Pass the main module name to avoid duplicate imports
        imports.extend(
            type_tracker.get_imports(main_module_name=async_class.__module__)
        )
        # Add base module import if needed
        module_name = getattr(module, "__name__", "")
        if "." in module_name:
            base_module = module_name.split(".")[0]
            # Match the package name exactly: a bare prefix test would treat
            # "from comfy_api..." as an import from "comfy".
            own_prefixes = (f"from {base_module} ", f"from {base_module}.")
            if not any(imp.startswith(own_prefixes) for imp in imports):
                imports.append(f"import {base_module}")
        return imports
    @classmethod
    def _get_class_attributes(cls, async_class: Type) -> list[tuple[str, Type]]:
        """Extract class attributes that are classes themselves."""
        class_attributes = []
        # Look for class attributes that are classes
        for name, attr in sorted(inspect.getmembers(async_class)):
            if isinstance(attr, type) and not name.startswith("_"):
                class_attributes.append((name, attr))
            elif (
                hasattr(async_class, "__annotations__")
                and name in async_class.__annotations__
            ):
                annotation = async_class.__annotations__[name]
                if isinstance(annotation, type):
                    class_attributes.append((name, annotation))
        return class_attributes
    @classmethod
    def _generate_inner_class_stub(
        cls,
        name: str,
        attr: Type,
        indent: str = "    ",
        type_tracker: Optional[TypeTracker] = None,
    ) -> list[str]:
        """Render the ``<name>Sync`` stub lines for a class nested in the async class.

        The result holds the class header, the class docstring, ``__init__``
        and every public function of ``attr``. When a signature cannot be
        inspected, a generic ``*args, **kwargs`` stub is emitted instead.
        """
        member_indent = f"{indent}    "
        lines = [f"{indent}class {name}Sync:"]
        # Class docstring first, when there is one
        class_doc = getattr(attr, "__doc__", None)
        if class_doc:
            lines.extend(cls._format_docstring_for_stub(class_doc, member_indent))
        # Constructor
        if hasattr(attr, "__init__"):
            try:
                ctor = attr.__init__
                ctor_sig = inspect.signature(ctor)
                # Hint resolution is best-effort
                try:
                    from typing import get_type_hints
                    ctor_hints = get_type_hints(ctor)
                except Exception:
                    ctor_hints = {}
                ctor_params = cls._format_method_parameters(
                    ctor_sig, type_hints=ctor_hints, type_tracker=type_tracker
                )
                # The docstring is placed before the signature line
                ctor_doc = getattr(ctor, "__doc__", None)
                if ctor_doc:
                    lines.extend(
                        cls._format_docstring_for_stub(ctor_doc, member_indent)
                    )
                lines.append(f"{member_indent}def __init__({ctor_params}) -> None: ...")
            except (ValueError, TypeError):
                lines.append(
                    f"{member_indent}def __init__(self, *args, **kwargs) -> None: ..."
                )
        # Public functions of the inner class, in name order
        public_methods = [
            (member_name, member)
            for member_name, member in sorted(
                inspect.getmembers(attr, predicate=inspect.isfunction)
            )
            if not member_name.startswith("_")
        ]
        for method_name, method in public_methods:
            try:
                # Docstring goes before the method signature
                if method.__doc__:
                    lines.extend(
                        cls._format_docstring_for_stub(method.__doc__, member_indent)
                    )
                signature_line = cls._generate_method_signature(
                    method_name, method, is_async=True, type_tracker=type_tracker
                )
                lines.append(f"{member_indent}{signature_line}")
            except (ValueError, TypeError):
                lines.append(
                    f"{member_indent}def {method_name}(self, *args, **kwargs): ..."
                )
        if not public_methods:
            lines.append(f"{member_indent}pass")
        return lines
    @classmethod
    def _format_docstring_for_stub(
        cls, docstring: str, indent: str = "    "
    ) -> list[str]:
        """Format a docstring for inclusion in a stub file with proper indentation."""
        if not docstring:
            return []
        # First, dedent the docstring to remove any existing indentation
        dedented = textwrap.dedent(docstring).strip()
        # Split into lines
        lines = dedented.split("\n")
        # Build the properly indented docstring
        result = []
        result.append(f'{indent}"""')
        for line in lines:
            if line.strip():  # Non-empty line
                result.append(f"{indent}{line}")
            else:  # Empty line
                result.append("")
        result.append(f'{indent}"""')
        return result
    @classmethod
    def _post_process_stub_content(cls, stub_content: list[str]) -> list[str]:
        """Post-process stub content to fix any remaining issues."""
        processed = []
        for line in stub_content:
            # Skip processing imports
            if line.startswith(("from ", "import ")):
                processed.append(line)
                continue
            # Fix method signatures missing return types
            if (
                line.strip().startswith("def ")
                and line.strip().endswith(": ...")
                and ") -> " not in line
            ):
                # Add -> None for methods without return annotation
                line = line.replace(": ...", " -> None: ...")
            processed.append(line)
        return processed
    @classmethod
    def generate_stub_file(cls, async_class: Type, sync_class: Type) -> None:
        """
        Generate a .pyi stub file for the sync class to help IDEs with type checking.

        The stub is written to ``generated/<sync_class.__name__>.pyi`` inside the
        directory of the module that defines ``async_class``. Nothing is written
        when that class lives in ``__main__`` or its module has no file path.

        Args:
            async_class: The async class whose members are described in the stub.
            sync_class: The sync class; only its ``__name__`` is used here.

        Stub generation is best-effort: any exception is logged together with
        its traceback and then swallowed.
        """
        try:
            # Only generate stub if we can determine module path
            if async_class.__module__ == "__main__":
                return
            module = inspect.getmodule(async_class)
            if not module:
                return
            # Some modules have no __file__ attribute at all, or have it set to None
            module_path = getattr(module, "__file__", None)
            if not module_path:
                return
            # Create stub file path in a 'generated' subdirectory
            stub_dir = os.path.join(os.path.dirname(module_path), "generated")
            # Ensure the generated directory exists
            os.makedirs(stub_dir, exist_ok=True)
            sync_stub_path = os.path.join(stub_dir, f"{sync_class.__name__}.pyi")
            # Create a type tracker for this stub generation
            type_tracker = TypeTracker()
            stub_content = []
            # We'll generate imports after processing all methods to capture all types
            # Leave a placeholder for imports
            imports_placeholder_index = len(stub_content)
            stub_content.append("")  # Will be replaced with imports later
            # Class definition
            stub_content.append(f"class {sync_class.__name__}:")
            # Docstring
            if async_class.__doc__:
                stub_content.extend(
                    cls._format_docstring_for_stub(async_class.__doc__, "    ")
                )
            # Generate __init__
            try:
                init_method = async_class.__init__
                init_signature = inspect.signature(init_method)
                # Try to get type hints for __init__
                try:
                    from typing import get_type_hints
                    init_hints = get_type_hints(init_method)
                except Exception:
                    init_hints = {}
                # Format parameters
                params_str = cls._format_method_parameters(
                    init_signature, type_hints=init_hints, type_tracker=type_tracker
                )
                # Add __init__ docstring if available (before the method)
                if hasattr(init_method, "__doc__") and init_method.__doc__:
                    stub_content.extend(
                        cls._format_docstring_for_stub(init_method.__doc__, "    ")
                    )
                stub_content.append(f"    def __init__({params_str}) -> None: ...")
            except (ValueError, TypeError):
                stub_content.append(
                    "    def __init__(self, *args, **kwargs) -> None: ..."
                )
            stub_content.append("")  # Add newline after __init__
            # Get class attributes
            class_attributes = cls._get_class_attributes(async_class)
            # Generate inner classes
            for name, attr in class_attributes:
                inner_class_stub = cls._generate_inner_class_stub(
                    name, attr, type_tracker=type_tracker
                )
                stub_content.extend(inner_class_stub)
                stub_content.append("")  # Add newline after the inner class
            # Add methods to the main class
            processed_methods = set()  # Keep track of methods we've processed
            for name, method in sorted(
                inspect.getmembers(async_class, predicate=inspect.isfunction)
            ):
                if name.startswith("_") or name in processed_methods:
                    continue
                processed_methods.add(name)
                try:
                    method_sig = cls._generate_method_signature(
                        name, method, is_async=True, type_tracker=type_tracker
                    )
                    # Add docstring if available (before the method signature for proper formatting)
                    if method.__doc__:
                        stub_content.extend(
                            cls._format_docstring_for_stub(method.__doc__, "    ")
                        )
                    stub_content.append(f"    {method_sig}")
                    stub_content.append("")  # Add newline after each method
                except (ValueError, TypeError):
                    # If we can't get the signature, just add a simple stub
                    stub_content.append(f"    def {name}(self, *args, **kwargs): ...")
                    stub_content.append("")  # Add newline
            # Add properties (typed as Any; a setter stub is added when the property has one)
            for name, prop in sorted(
                inspect.getmembers(async_class, lambda x: isinstance(x, property))
            ):
                stub_content.append("    @property")
                stub_content.append(f"    def {name}(self) -> Any: ...")
                if prop.fset:
                    stub_content.append(f"    @{name}.setter")
                    stub_content.append(
                        f"    def {name}(self, value: Any) -> None: ..."
                    )
                stub_content.append("")  # Add newline after each property
            # Add placeholders for the nested class instances
            # Check the actual attribute names from class annotations and attributes
            attribute_mappings = {}
            # Collect all annotations from the class hierarchy; walking the MRO in
            # reverse lets annotations of derived classes override those of bases
            all_annotations = {}
            for base_class in reversed(inspect.getmro(async_class)):
                if hasattr(base_class, "__annotations__"):
                    all_annotations.update(base_class.__annotations__)
            for attr_name, attr_type in sorted(all_annotations.items()):
                for class_name, class_type in class_attributes:
                    # The annotation may be the class itself, something with the
                    # same __name__, or the class name as a string
                    if (
                        attr_type == class_type
                        or (hasattr(attr_type, "__name__") and attr_type.__name__ == class_name)
                        or (isinstance(attr_type, str) and attr_type == class_name)
                    ):
                        attribute_mappings[class_name] = attr_name
            # Add the attribute declarations with proper names
            for class_name, _ in class_attributes:
                # Use the annotation name if it exists, even if the attribute doesn't exist yet
                # This is because the attribute might be created at runtime
                attr_name = attribute_mappings.get(class_name, class_name)
                stub_content.append(f"    {attr_name}: {class_name}Sync")
            stub_content.append("")  # Add a final newline
            # Now generate imports with all discovered types
            imports = cls._generate_imports(async_class, type_tracker)
            # Deduplicate imports while preserving order
            seen = set()
            unique_imports = []
            for imp in imports:
                if imp not in seen:
                    seen.add(imp)
                    unique_imports.append(imp)
                else:
                    logging.warning(f"Duplicate import detected: {imp}")
            # Replace the placeholder with actual imports
            stub_content[imports_placeholder_index : imports_placeholder_index + 1] = (
                unique_imports
            )
            # Post-process stub content
            stub_content = cls._post_process_stub_content(stub_content)
            # Write stub file. The encoding is explicit because docstrings may hold
            # non-ASCII text that the platform default encoding cannot represent.
            with open(sync_stub_path, "w", encoding="utf-8") as f:
                f.write("\n".join(stub_content))
            logging.info(f"Generated stub file: {sync_stub_path}")
        except Exception as e:
            # If stub generation fails, log the error but don't break the main functionality
            logging.error(
                f"Error generating stub file for {sync_class.__name__}: {str(e)}"
            )
            import traceback
            logging.error(traceback.format_exc())
 def create_sync_class(async_class: Type, thread_pool_size=10) -> Type:
    """
    Build the synchronous counterpart of an async class.
    This is a module-level shortcut for AsyncToSyncConverter.create_sync_class.
    Args:
        async_class: The async class to convert
        thread_pool_size: Size of thread pool to use
    Returns:
        A new class with sync versions of all async methods
    """
    sync_class = AsyncToSyncConverter.create_sync_class(async_class, thread_pool_size)
    return sync_class
--- a/comfy_api/internal/singleton.py
+++ b/comfy_api/internal/singleton.py
@ -0,0 +1,33 @@
 from typing import Type, TypeVar
 class SingletonMetaclass(type):
    """Metaclass that keeps one shared instance per class in a common registry."""
    T = TypeVar("T", bound="SingletonMetaclass")
    _instances = {}
    def __call__(cls, *args, **kwargs):
        """Return the stored instance of ``cls``, constructing it on first use."""
        registry = cls._instances
        if cls in registry:
            return registry[cls]
        registry[cls] = super().__call__(*args, **kwargs)
        return registry[cls]
    def inject_instance(cls: Type[T], instance: T) -> None:
        """Register ``instance`` as the singleton of ``cls``; only allowed before one exists."""
        already_registered = cls in SingletonMetaclass._instances
        assert not already_registered, (
            "Cannot inject instance after first instantiation"
        )
        SingletonMetaclass._instances[cls] = instance
    def get_instance(cls: Type[T], *args, **kwargs) -> T:
        """
        Return the singleton instance of the class.
        The instance is created from the given arguments when none is stored yet.
        """
        registry = SingletonMetaclass._instances
        if cls not in registry:
            registry[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]
 class ProxiedSingleton(metaclass=SingletonMetaclass):
    """Base class whose subclasses are singletons through SingletonMetaclass."""
    def __init__(self):
        # No state of its own; just continue the constructor chain
        super().__init__()
--- a/comfy_api/latest/__init__.py
+++ b/comfy_api/latest/__init__.py
@ -0,0 +1,124 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import Type, TYPE_CHECKING
 from comfy_api.internal import ComfyAPIBase
 from comfy_api.internal.singleton import ProxiedSingleton
 from comfy_api.internal.async_to_sync import create_sync_class
 from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
 from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents
 from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents
 from comfy_api.latest._io import _IO as io  #noqa: F401
 from comfy_api.latest._ui import _UI as ui  #noqa: F401
 # from comfy_api.latest._resources import _RESOURCES as resources  #noqa: F401
 from comfy_execution.utils import get_executing_context
 from comfy_execution.progress import get_progress_state, PreviewImageTuple
 from PIL import Image
 from comfy.cli_args import args
 import numpy as np
 class ComfyAPI_latest(ComfyAPIBase):
    # Version label and stability flag of this API surface
    VERSION = "latest"
    STABLE = False
    class Execution(ProxiedSingleton):
        async def set_progress(
            self,
            value: float,
            max_value: float,
            node_id: str | None = None,
            preview_image: Image.Image | ImageInput | None = None,
            ignore_size_limit: bool = False,
        ) -> None:
            """
            Update the progress bar displayed in the ComfyUI interface.
            This function allows custom nodes and API calls to report their progress
            back to the user interface, providing visual feedback during long operations.
            Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK
            Args:
                value: Current progress value
                max_value: Progress value that corresponds to completion
                node_id: Node to report progress for; when omitted, the node of the
                    current executing context is used
                preview_image: Optional preview, given as a PIL image or an image tensor
                ignore_size_limit: When True, the preview is passed on without a size
                    limit instead of using args.preview_size
            Raises:
                ValueError: If node_id is omitted and there is no executing context
            """
            executing_context = get_executing_context()
            if node_id is None and executing_context is not None:
                node_id = executing_context.node_id
            if node_id is None:
                raise ValueError("node_id must be provided if not in executing context")
            # Convert preview_image to PreviewImageTuple if needed
            to_display: PreviewImageTuple | Image.Image | ImageInput | None = preview_image
            if to_display is not None:
                # First convert to PIL Image if needed
                if isinstance(to_display, ImageInput):
                    # Convert ImageInput (torch.Tensor) to PIL Image
                    # Handle tensor shape [B, H, W, C] -> get first image if batch
                    tensor = to_display
                    if len(tensor.shape) == 4:
                        tensor = tensor[0]
                    # Convert to numpy array and scale to 0-255. Clip before the
                    # cast: values outside that range would otherwise wrap around
                    # when converted to uint8.
                    image_np = np.clip(tensor.cpu().numpy() * 255, 0, 255).astype(np.uint8)
                    to_display = Image.fromarray(image_np)
                if isinstance(to_display, Image.Image):
                    # Detect image format from PIL Image
                    image_format = to_display.format if to_display.format else "JPEG"
                    # Use None for preview_size if ignore_size_limit is True
                    preview_size = None if ignore_size_limit else args.preview_size
                    to_display = (image_format, to_display, preview_size)
            get_progress_state().update_progress(
                node_id=node_id,
                value=value,
                max_value=max_value,
                image=to_display,
            )
    # Annotation only: no value is assigned here
    execution: Execution
 class ComfyExtension(ABC):
    """
    Abstract base class for extensions.
    Subclasses must implement get_node_list; on_load has an empty default implementation.
    """
    async def on_load(self) -> None:
        """
        Called when an extension is loaded.
        This should be used to initialize any global resources needed by the extension.
        """
    @abstractmethod
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """
        Returns a list of nodes that this extension provides.
        """
 class Input:
    """Namespace that exposes the input types under short names."""
    Image = ImageInput
    Audio = AudioInput
    Mask = MaskInput
    Latent = LatentInput
    Video = VideoInput
 class InputImpl:
    """Namespace that exposes the video input implementations imported from _input_impl."""
    VideoFromFile = VideoFromFile
    VideoFromComponents = VideoFromComponents
 class Types:
    """Namespace that exposes the video helper types imported from _util."""
    VideoCodec = VideoCodec
    VideoContainer = VideoContainer
    VideoComponents = VideoComponents
 # Alias for the API class defined in this module
 ComfyAPI = ComfyAPI_latest
 # Create a synchronous version of the API
 if TYPE_CHECKING:
    # Only seen by static type checkers: ComfyAPISync is typed from the generated stub module
    import comfy_api.latest.generated.ComfyAPISyncStub  # type: ignore
    ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub]
 # At runtime the sync class is built from the async API class
 ComfyAPISync = create_sync_class(ComfyAPI_latest)
 # Names exported by this module
 __all__ = [
    "ComfyAPI",
    "ComfyAPISync",
    "Input",
    "InputImpl",
    "Types",
    "ComfyExtension",
 ]
--- a/comfy_api/latest/_input/__init__.py
+++ b/comfy_api/latest/_input/__init__.py
@ -0,0 +1,10 @@
 from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput
 from .video_types import VideoInput
 # Input types re-exported from basic_types and video_types
 __all__ = [
    "ImageInput",
    "AudioInput",
    "VideoInput",
    "MaskInput",
    "LatentInput",
 ]
--- a/comfy_api/latest/_input/basic_types.py
+++ b/comfy_api/latest/_input/basic_types.py
@ -0,0 +1,42 @@
 import torch
 from typing import TypedDict, List, Optional
 # Alias: image inputs are plain torch tensors (layout described in the string below)
 ImageInput = torch.Tensor
 """
 An image in format [B, H, W, C] where B is the batch size, C is the number of channels,
 """
 # Alias: mask inputs are plain torch tensors (layout described in the string below)
 MaskInput = torch.Tensor
 """
 A mask in format [B, H, W] where B is the batch size
 """
 class AudioInput(TypedDict):
    """
    TypedDict representing audio input.
    It has two keys: ``waveform`` (a tensor) and ``sample_rate`` (an int).
    """
    waveform: torch.Tensor
    """
    Tensor in the format [B, C, T] where B is the batch size, C is the number of channels,
    """
    # Sample rate of the waveform; presumably in Hz and T above is the sample count — TODO confirm
    sample_rate: int
 class LatentInput(TypedDict):
    """
    TypedDict representing latent input.
    It has three keys: ``samples``, ``noise_mask`` and ``batch_index``.
    """
    samples: torch.Tensor
    """
    Tensor in the format [B, C, H, W] where B is the batch size, C is the number of channels,
    H is the height, and W is the width.
    """
    noise_mask: Optional[MaskInput]
    """
    Optional noise mask tensor in the same format as samples.
    """
    # NOTE(review): presumably one index per sample in the batch — confirm against callers
    batch_index: Optional[List[int]]
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@ -0,0 +1,85 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import Optional, Union
 import io
 import av
 from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
 class VideoInput(ABC):
    """
    Abstract base class for video input types.
    Subclasses must implement get_components and save_to. The other methods have
    default implementations built on those two.
    """
    @abstractmethod
    def get_components(self) -> VideoComponents:
        """
        Abstract method to get the video components (images, audio, and frame rate).
        Returns:
            VideoComponents containing images, audio, and frame rate
        """
        pass
    @abstractmethod
    def save_to(
        self,
        path: Union[str, io.BytesIO],
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        """
        Abstract method to save the video input to a file.
        Args:
            path: Destination. get_stream_source passes an in-memory BytesIO buffer
                here, so implementations have to accept one besides a file path.
            format: Container format to write (defaults to VideoContainer.AUTO)
            codec: Codec to use (defaults to VideoCodec.AUTO)
            metadata: Optional metadata dict; presumably written into the output — TODO confirm
        """
        pass
    def get_stream_source(self) -> Union[str, io.BytesIO]:
        """
        Get a streamable source for the video. This allows processing without
        loading the entire video into memory.
        Returns:
            Either a file path (str) or a BytesIO object that can be opened with av.
        Default implementation creates a BytesIO buffer, but subclasses should
        override this for better performance when possible.
        """
        buffer = io.BytesIO()
        self.save_to(buffer)
        # Rewind so that the caller reads from the start of the buffer
        buffer.seek(0)
        return buffer
    # Provide a default implementation, but subclasses can provide optimized versions
    # if possible.
    def get_dimensions(self) -> tuple[int, int]:
        """
        Returns the dimensions of the video input.
        The default implementation reads them from the shape of the images
        returned by get_components.
        Returns:
            Tuple of (width, height)
        """
        components = self.get_components()
        # Width is taken from axis 2 and height from axis 1 of the images
        return components.images.shape[2], components.images.shape[1]
    def get_duration(self) -> float:
        """
        Returns the duration of the video in seconds.
        The default implementation divides the number of images (axis 0) by the frame rate.
        Returns:
            Duration in seconds
        """
        components = self.get_components()
        frame_count = components.images.shape[0]
        return float(frame_count / components.frame_rate)
    def get_container_format(self) -> str:
        """
        Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').
        Returns:
            Container format as string
        """
        # Default implementation - subclasses should override for better performance
        source = self.get_stream_source()
        # Open the source with av only to read the format name
        with av.open(source, mode="r") as container:
            return container.format.name
--- a/comfy_api/latest/_input_impl/__init__.py
+++ b/comfy_api/latest/_input_impl/__init__.py
@ -0,0 +1,7 @@
 from .video_types import VideoFromFile, VideoFromComponents
 # Names re-exported from video_types
 __all__ = [
    # Implementations
    "VideoFromFile",
    "VideoFromComponents",
 ]
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@ -0,0 +1,324 @@
 from __future__ import annotations
 from av.container import InputContainer
 from av.subtitles.stream import SubtitleStream
 from fractions import Fraction
 from typing import Optional
 from comfy_api.latest._input import AudioInput, VideoInput
 import av
 import io
 import json
 import numpy as np
 import torch
 from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
 def container_to_output_format(container_format: str | None) -> str | None:
    """
    Pick a single writable format name out of a container's `format` string.

    A container's `format` may be a comma-separated list of formats,
    e.g. an ISO container may report `mov,mp4,m4a,3gp,3g2,mj2`, whereas
    writing to a file/stream with `av.open` requires a single format,
    or `None` to auto-detect.

    Returns:
        `None` when `container_format` is empty or `None`; otherwise the text
        before the first comma (the whole string when there is no comma).
    """
    if container_format:
        first, _, _ = container_format.partition(",")
        return first
    return None  # Auto-detect
 def get_open_write_kwargs(
    dest: str | io.BytesIO, container_format: str, to_format: str | None
 ) -> dict:
    """
    Build the kwargs for writing a `VideoFromFile` to a file/stream with `av.open`.

    Args:
        dest: Destination path, or a BytesIO buffer to write into.
        container_format: Format name(s) reported by the source container.
        to_format: Requested output format; `VideoContainer.AUTO` means
            "reuse the source container's format".

    Returns:
        A dict with `mode` and `options`, plus `format` when `dest` is a BytesIO.
    """
    kwargs = {
        "mode": "w",
        # If isobmff, preserve custom metadata tags (workflow, prompt, extra_pnginfo)
        "options": {"movflags": "use_metadata_tags"},
    }
    if not isinstance(dest, io.BytesIO):
        # A path destination lets av infer the format from the file extension.
        return kwargs

    # Writing to a buffer: the format must be set explicitly.
    if to_format == VideoContainer.AUTO:
        target = container_format.lower()
    elif isinstance(to_format, str):
        target = to_format.lower()
    else:
        target = to_format
    kwargs["format"] = container_to_output_format(target)
    return kwargs
 class VideoFromFile(VideoInput):
    """
    Class representing video input from a file.

    The source is either a path on disk or a BytesIO object holding the file
    contents. Every method that reads the source opens it anew with av and
    rewinds a BytesIO source first, so earlier reads do not affect later ones.
    """

    def __init__(self, file: str | io.BytesIO):
        """
        Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object
        containing the file contents.
        """
        self.__file = file

    def __source(self) -> str | io.BytesIO:
        """Return the wrapped source, rewound to the start when it is a BytesIO."""
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        return self.__file

    def get_stream_source(self) -> str | io.BytesIO:
        """
        Return the underlying file source for efficient streaming.
        This avoids unnecessary memory copies when the source is already a file path.
        """
        return self.__source()

    def get_dimensions(self) -> tuple[int, int]:
        """
        Returns the dimensions of the video input.

        Returns:
            Tuple of (width, height), taken from the first video stream.

        Raises:
            ValueError: If the container has no video stream.
        """
        with av.open(self.__source(), mode='r') as container:
            for stream in container.streams:
                if stream.type == 'video':
                    assert isinstance(stream, av.VideoStream)
                    return stream.width, stream.height
        raise ValueError(f"No video stream found in file '{self.__file}'")

    def get_duration(self) -> float:
        """
        Returns the duration of the video in seconds.

        Tries, in order: the container-level duration, the video stream's
        frame count divided by its average rate, and finally decoding the
        video stream to count frames.

        Returns:
            Duration in seconds

        Raises:
            ValueError: If none of the strategies yields a duration.
        """
        with av.open(self.__source(), mode="r") as container:
            if container.duration is not None:
                return float(container.duration / av.time_base)

            # Fallback: calculate from frame count and frame rate
            video_stream = next(
                (s for s in container.streams if s.type == "video"), None
            )
            if video_stream and video_stream.frames and video_stream.average_rate:
                return float(video_stream.frames / video_stream.average_rate)

            # Last resort: decode frames to count them
            if video_stream and video_stream.average_rate:
                frame_count = 0
                container.seek(0)
                for packet in container.demux(video_stream):
                    for _ in packet.decode():
                        frame_count += 1
                if frame_count > 0:
                    return float(frame_count / video_stream.average_rate)

        raise ValueError(f"Could not determine duration for file '{self.__file}'")

    def get_container_format(self) -> str:
        """
        Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').

        Returns:
            Container format as string
        """
        with av.open(self.__source(), mode='r') as container:
            return container.format.name

    def get_components_internal(self, container: InputContainer) -> VideoComponents:
        """
        Decode an already opened container into frames, frame rate, audio and metadata.

        Args:
            container: An av input container opened for reading.

        Returns:
            VideoComponents holding the stacked frames scaled by 1/255, the
            video stream's average rate (1 when unavailable), the decoded
            audio if any, and the container metadata.
        """
        # Get video frames
        frames = [
            torch.from_numpy(frame.to_ndarray(format='rgb24')) / 255.0  # shape: (H, W, 3)
            for frame in container.decode(video=0)
        ]
        # NOTE(review): the empty fallback is (0, 3, 0, 0) while decoded frames
        # stack to (N, H, W, 3) - confirm which layout callers expect.
        images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)

        # Get frame rate
        video_stream = next(s for s in container.streams if s.type == 'video')
        frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1)

        # Get audio if available
        audio = None
        try:
            container.seek(0)  # Reset the container to the beginning
            for stream in container.streams:
                if stream.type != 'audio':
                    continue
                assert isinstance(stream, av.AudioStream)
                audio_frames = []
                for packet in container.demux(stream):
                    for frame in packet.decode():
                        assert isinstance(frame, av.AudioFrame)
                        audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
                if len(audio_frames) > 0:
                    audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
                    audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
                    audio = AudioInput({
                        "waveform": audio_tensor,
                        "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1,
                    })
        except StopIteration:
            pass  # No audio stream

        metadata = container.metadata
        return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)

    def get_components(self) -> VideoComponents:
        """Open the source and decode it fully via `get_components_internal`."""
        with av.open(self.__source(), mode='r') as container:
            return self.get_components_internal(container)

    def save_to(
        self,
        path: str | io.BytesIO,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        """
        Write the video to `path`, copying the encoded streams when possible.

        The streams are copied packet by packet when the requested `format` is
        AUTO or one of the source container's formats, and the requested
        `codec` is AUTO or equal to the source video codec. Otherwise the
        video is decoded and saved again through `VideoFromComponents.save_to`.

        Args:
            path: Destination path or BytesIO buffer.
            format: Requested container format.
            codec: Requested video codec.
            metadata: Extra metadata; its keys override the source's metadata.
                Non-string values are stored as JSON.
        """
        with av.open(self.__source(), mode='r') as container:
            container_format = container.format.name
            video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None
            reuse_streams = True
            if format != VideoContainer.AUTO and format not in container_format.split(","):
                reuse_streams = False
            if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
                reuse_streams = False

            if not reuse_streams:
                components = self.get_components_internal(container)
                video = VideoFromComponents(components)
                return video.save_to(
                    path,
                    format=format,
                    codec=codec,
                    metadata=metadata
                )

            streams = container.streams

            open_kwargs = get_open_write_kwargs(path, container_format, format)
            with av.open(path, **open_kwargs) as output_container:
                # Copy over the original metadata
                for key, value in container.metadata.items():
                    if metadata is None or key not in metadata:
                        output_container.metadata[key] = value

                # Add our new metadata
                if metadata is not None:
                    for key, value in metadata.items():
                        if isinstance(value, str):
                            output_container.metadata[key] = value
                        else:
                            output_container.metadata[key] = json.dumps(value)

                # Add streams to the new container
                stream_map = {}
                for stream in streams:
                    if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)):
                        out_stream = output_container.add_stream_from_template(template=stream, opaque=True)
                        stream_map[stream] = out_stream

                # Write packets to the new container
                for packet in container.demux():
                    # Packets without a dts are not muxed.
                    if packet.stream in stream_map and packet.dts is not None:
                        packet.stream = stream_map[packet.stream]
                        output_container.mux(packet)
 class VideoFromComponents(VideoInput):
    """
    Class representing video input from tensors.
    """

    def __init__(self, components: VideoComponents):
        """Wrap already decoded components (images, frame rate, optional audio)."""
        self.__components = components

    def get_components(self) -> VideoComponents:
        """Return a new VideoComponents with the wrapped images, audio and frame rate."""
        return VideoComponents(
            images=self.__components.images,
            audio=self.__components.audio,
            frame_rate=self.__components.frame_rate
        )

    def save_to(
        self,
        path: str | io.BytesIO,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        """
        Encode the components as H264 video (plus mono AAC audio when present).

        Args:
            path: Destination path, or a BytesIO buffer to write into.
            format: Must be AUTO or MP4.
            codec: Must be AUTO or H264.
            metadata: Optional metadata; every value is stored as JSON.

        Raises:
            ValueError: If an unsupported format or codec is requested.
        """
        if format != VideoContainer.AUTO and format != VideoContainer.MP4:
            raise ValueError("Only MP4 format is supported for now")
        if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
            raise ValueError("Only H264 codec is supported for now")

        open_kwargs = {"mode": "w", "options": {"movflags": "use_metadata_tags"}}
        if isinstance(path, io.BytesIO):
            # A buffer has no file extension to infer the container from, so
            # name the only supported output format explicitly.
            open_kwargs["format"] = VideoContainer.MP4.value

        with av.open(path, **open_kwargs) as output:
            # Add metadata before writing any streams
            if metadata is not None:
                for key, value in metadata.items():
                    output.metadata[key] = json.dumps(value)

            frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000)
            # Create a video stream
            video_stream = output.add_stream('h264', rate=frame_rate)
            video_stream.width = self.__components.images.shape[2]
            video_stream.height = self.__components.images.shape[1]
            video_stream.pix_fmt = 'yuv420p'

            # Create an audio stream
            audio_sample_rate = 1
            audio_stream: Optional[av.AudioStream] = None
            if self.__components.audio:
                audio_sample_rate = int(self.__components.audio['sample_rate'])
                audio_stream = output.add_stream('aac', rate=audio_sample_rate)
                audio_stream.sample_rate = audio_sample_rate
                audio_stream.format = 'fltp'

            # Encode video
            for frame in self.__components.images:
                img = (frame * 255).clamp(0, 255).byte().cpu().numpy()  # shape: (H, W, 3)
                frame = av.VideoFrame.from_ndarray(img, format='rgb24')
                frame = frame.reformat(format='yuv420p')  # Convert to YUV420P as required by h264
                packet = video_stream.encode(frame)
                output.mux(packet)

            # Flush video
            packet = video_stream.encode(None)
            output.mux(packet)

            if audio_stream and self.__components.audio:
                # Encode audio
                waveform = self.__components.audio['waveform']
                # At least one sample per chunk, so the step below is never zero.
                samples_per_frame = max(1, int(audio_sample_rate / frame_rate))
                total_samples = waveform.shape[2]
                # Stepping over the sample range also emits the final, shorter
                # chunk instead of dropping the remainder.
                for start in range(0, total_samples, samples_per_frame):
                    end = start + samples_per_frame
                    # TODO(Feature) - Add support for stereo audio
                    chunk = (
                        waveform[0, 0, start:end]
                        .unsqueeze(0)
                        .contiguous()
                        .float()  # 'fltp' frames are built from float32 data
                        .cpu()  # numpy conversion needs host memory
                        .numpy()
                    )
                    audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono')
                    audio_frame.sample_rate = audio_sample_rate
                    audio_frame.pts = start
                    for packet in audio_stream.encode(audio_frame):
                        output.mux(packet)

                # Flush audio
                for packet in audio_stream.encode(None):
                    output.mux(packet)
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
--- a/comfy_api/latest/_resources.py
+++ b/comfy_api/latest/_resources.py
@ -0,0 +1,72 @@
 from __future__ import annotations
 import comfy.utils
 import folder_paths
 import logging
 from abc import ABC, abstractmethod
 from typing import Any
 import torch
 class ResourceKey(ABC):
    """Base class for keys used to request a resource."""
    # Type of the value associated with the key; `Any` at this level.
    Type = Any

    def __init__(self):
        pass
 class TorchDictFolderFilename(ResourceKey):
    '''Key for requesting a torch file via file_name from a folder category.'''
    Type = dict[str, torch.Tensor]

    def __init__(self, folder_name: str, file_name: str):
        self.folder_name = folder_name
        self.file_name = file_name

    def __hash__(self):
        # Hash the same pair that __eq__ compares so equal keys collide.
        identity = (self.folder_name, self.file_name)
        return hash(identity)

    def __eq__(self, other: object) -> bool:
        if isinstance(other, TorchDictFolderFilename):
            return (self.folder_name, self.file_name) == (other.folder_name, other.file_name)
        return False

    def __str__(self):
        return f"{self.folder_name} -> {self.file_name}"
 class Resources(ABC):
    """Abstract interface for looking up resources by key."""

    def __init__(self):
        pass

    @abstractmethod
    def get(self, key: ResourceKey, default: Any=...) -> Any:
        """Return the resource for `key`; `default` uses `...` as its "not given" marker."""
 class ResourcesLocal(Resources):
    """Resources implementation that loads from local folders and caches results in a dict."""

    def __init__(self):
        super().__init__()
        self.local_resources: dict[ResourceKey, Any] = {}

    def get(self, key: ResourceKey, default: Any=...) -> Any:
        """
        Return the resource for `key`, loading and caching it on first use.

        Only `TorchDictFolderFilename` keys can be loaded. Without a `default`
        the path lookup uses `get_full_path_or_raise`; with one, a missing
        file yields `default` instead.

        Raises:
            Exception: If nothing was loaded and no `default` was given.
        """
        hit = self.local_resources.get(key, None)
        if hit is not None:
            logging.info(f"Using cached resource '{key}'")
            return hit

        logging.info(f"Loading resource '{key}'")
        has_default = default is not ...
        loaded = None
        if isinstance(key, TorchDictFolderFilename):
            if has_default:
                full_path = folder_paths.get_full_path(key.folder_name, key.file_name)
                if full_path is not None:
                    loaded = comfy.utils.load_torch_file(full_path, safe_load=True)
            else:
                full_path = folder_paths.get_full_path_or_raise(key.folder_name, key.file_name)
                loaded = comfy.utils.load_torch_file(full_path, safe_load=True)

        if loaded is None:
            if has_default:
                return default
            raise Exception(f"Unsupported resource key type: {type(key)}")

        self.local_resources[key] = loaded
        return loaded
 class _RESOURCES:
    # Namespace class: re-exposes this module's resource classes as attributes
    # under their own names.
    ResourceKey = ResourceKey
    TorchDictFolderFilename = TorchDictFolderFilename
    Resources = Resources
    ResourcesLocal = ResourcesLocal
--- a/comfy_api/latest/_ui.py
+++ b/comfy_api/latest/_ui.py
@ -0,0 +1,457 @@
 from __future__ import annotations
 import json
 import os
 import random
 from io import BytesIO
 from typing import Type
 import av
 import numpy as np
 import torch
 import torchaudio
 from PIL import Image as PILImage
 from PIL.PngImagePlugin import PngInfo
 import folder_paths
 # used for image preview
 from comfy.cli_args import args
 from comfy_api.latest._io import ComfyNode, FolderType, Image, _UIOutput
 class SavedResult(dict):
    """Dict describing one saved file by `filename`, `subfolder` and folder `type` value."""

    def __init__(self, filename: str, subfolder: str, type: FolderType):
        # Only the enum's value is stored under "type".
        super().__init__({"filename": filename, "subfolder": subfolder, "type": type.value})

    @property
    def filename(self) -> str:
        name = self["filename"]
        return name

    @property
    def subfolder(self) -> str:
        folder = self["subfolder"]
        return folder

    @property
    def type(self) -> FolderType:
        # Rebuild the enum member from the stored value.
        return FolderType(self["type"])
 class SavedImages(_UIOutput):
    """A UI output class to represent one or more saved images, potentially animated."""

    def __init__(self, results: list[SavedResult], is_animated: bool = False):
        super().__init__()
        self.results = results
        self.is_animated = is_animated

    def as_dict(self) -> dict:
        """Return the results under "images", adding "animated" only when animated."""
        if not self.is_animated:
            return {"images": self.results}
        return {"images": self.results, "animated": (True,)}
 class SavedAudios(_UIOutput):
    """UI wrapper around one or more audio files on disk (FLAC / MP3 / Opus)."""

    def __init__(self, results: list[SavedResult]):
        super().__init__()
        self.results = results

    def as_dict(self) -> dict:
        """Return the results under the "audio" key."""
        payload = {"audio": self.results}
        return payload
 def _get_directory_by_folder_type(folder_type: FolderType) -> str:
    """Map a folder type to its directory; anything other than input/output maps to temp."""
    if folder_type == FolderType.input:
        directory = folder_paths.get_input_directory()
    elif folder_type == FolderType.output:
        directory = folder_paths.get_output_directory()
    else:
        directory = folder_paths.get_temp_directory()
    return directory
 class ImageSaveHelper:
    """Static helpers that write image batches to disk and build their embedded metadata."""

    @staticmethod
    def _convert_tensor_to_pil(image_tensor: torch.Tensor) -> PILImage.Image:
        """Scale one tensor by 255, clip to [0, 255], cast to uint8 and wrap it in a PIL Image."""
        scaled = 255.0 * image_tensor.cpu().numpy()
        as_bytes = np.clip(scaled, 0, 255).astype(np.uint8)
        return PILImage.fromarray(as_bytes)

    @staticmethod
    def _create_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None:
        """Build PNG text chunks for prompt and extra_pnginfo, or None when metadata is off."""
        if args.disable_metadata or cls is None or not cls.hidden:
            return None
        info = PngInfo()
        if cls.hidden.prompt:
            info.add_text("prompt", json.dumps(cls.hidden.prompt))
        if cls.hidden.extra_pnginfo:
            for name in cls.hidden.extra_pnginfo:
                info.add_text(name, json.dumps(cls.hidden.extra_pnginfo[name]))
        return info

    @staticmethod
    def _create_animated_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None:
        """Build `comf` chunks (written after IDAT) for prompt and extra_pnginfo of an APNG."""
        if args.disable_metadata or cls is None or not cls.hidden:
            return None

        def add_comf_chunk(info: PngInfo, name: str, payload) -> None:
            # Chunk body layout: latin-1 name, NUL separator, latin-1 JSON payload.
            body = name.encode("latin-1", "strict") + b"\0" + json.dumps(payload).encode("latin-1", "strict")
            info.add(b"comf", body, after_idat=True)

        info = PngInfo()
        if cls.hidden.prompt:
            add_comf_chunk(info, "prompt", cls.hidden.prompt)
        if cls.hidden.extra_pnginfo:
            for name in cls.hidden.extra_pnginfo:
                add_comf_chunk(info, name, cls.hidden.extra_pnginfo[name])
        return info

    @staticmethod
    def _create_webp_metadata(pil_image: PILImage.Image, cls: Type[ComfyNode] | None) -> PILImage.Exif:
        """Return the image's EXIF data, filled with prompt/extra_pnginfo unless metadata is off."""
        exif = pil_image.getexif()
        if args.disable_metadata or cls is None or cls.hidden is None:
            return exif
        if cls.hidden.prompt is not None:
            exif[0x0110] = "prompt:{}".format(json.dumps(cls.hidden.prompt))  # EXIF 0x0110 = Model
        if cls.hidden.extra_pnginfo is not None:
            # Entries are written to descending tags starting at 0x010F (Make).
            for offset, (key, value) in enumerate(cls.hidden.extra_pnginfo.items()):
                exif[0x010F - offset] = "{}:{}".format(key, json.dumps(value))
        return exif

    @staticmethod
    def save_images(
        images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, compress_level = 4,
    ) -> list[SavedResult]:
        """Write every image of the batch to its own PNG file and describe each saved file."""
        target_dir = _get_directory_by_folder_type(folder_type)
        full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path(
            filename_prefix, target_dir, images[0].shape[1], images[0].shape[0]
        )
        png_info = ImageSaveHelper._create_png_metadata(cls)
        saved = []
        for batch_number, image_tensor in enumerate(images):
            pil_image = ImageSaveHelper._convert_tensor_to_pil(image_tensor)
            stem = filename.replace("%batch_num%", str(batch_number))
            file = f"{stem}_{counter:05}_.png"
            destination = os.path.join(full_output_folder, file)
            pil_image.save(destination, pnginfo=png_info, compress_level=compress_level)
            saved.append(SavedResult(file, subfolder, folder_type))
            counter += 1
        return saved

    @staticmethod
    def get_save_images_ui(images, filename_prefix: str, cls: Type[ComfyNode] | None, compress_level=4) -> SavedImages:
        """Save the batch to the output folder and wrap the results for the node UI."""
        saved = ImageSaveHelper.save_images(
            images,
            filename_prefix=filename_prefix,
            folder_type=FolderType.output,
            cls=cls,
            compress_level=compress_level,
        )
        return SavedImages(saved)

    @staticmethod
    def save_animated_png(
        images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, fps: float, compress_level: int
    ) -> SavedResult:
        """Write the whole batch into one animated PNG whose frame duration is 1000 / fps ms."""
        target_dir = _get_directory_by_folder_type(folder_type)
        full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path(
            filename_prefix, target_dir, images[0].shape[1], images[0].shape[0]
        )
        frames = [ImageSaveHelper._convert_tensor_to_pil(img) for img in images]
        png_info = ImageSaveHelper._create_animated_png_metadata(cls)
        file = f"{filename}_{counter:05}_.png"
        destination = os.path.join(full_output_folder, file)
        first_frame, remaining = frames[0], frames[1:]
        first_frame.save(
            destination,
            pnginfo=png_info,
            compress_level=compress_level,
            save_all=True,
            duration=int(1000.0 / fps),
            append_images=remaining,
        )
        return SavedResult(file, subfolder, folder_type)

    @staticmethod
    def get_save_animated_png_ui(
        images, filename_prefix: str, cls: Type[ComfyNode] | None, fps: float, compress_level: int
    ) -> SavedImages:
        """Save an animated PNG to the output folder and wrap it for the node UI."""
        saved = ImageSaveHelper.save_animated_png(
            images,
            filename_prefix=filename_prefix,
            folder_type=FolderType.output,
            cls=cls,
            fps=fps,
            compress_level=compress_level,
        )
        # A single frame is reported as a still image.
        return SavedImages([saved], is_animated=len(images) > 1)

    @staticmethod
    def save_animated_webp(
        images,
        filename_prefix: str,
        folder_type: FolderType,
        cls: Type[ComfyNode] | None,
        fps: float,
        lossless: bool,
        quality: int,
        method: int,
    ) -> SavedResult:
        """Write the whole batch into one animated WebP whose frame duration is 1000 / fps ms."""
        target_dir = _get_directory_by_folder_type(folder_type)
        full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path(
            filename_prefix, target_dir, images[0].shape[1], images[0].shape[0]
        )
        frames = [ImageSaveHelper._convert_tensor_to_pil(img) for img in images]
        exif = ImageSaveHelper._create_webp_metadata(frames[0], cls)
        file = f"{filename}_{counter:05}_.webp"
        first_frame, remaining = frames[0], frames[1:]
        first_frame.save(
            os.path.join(full_output_folder, file),
            save_all=True,
            duration=int(1000.0 / fps),
            append_images=remaining,
            exif=exif,
            lossless=lossless,
            quality=quality,
            method=method,
        )
        return SavedResult(file, subfolder, folder_type)

    @staticmethod
    def get_save_animated_webp_ui(
        images,
        filename_prefix: str,
        cls: Type[ComfyNode] | None,
        fps: float,
        lossless: bool,
        quality: int,
        method: int,
    ) -> SavedImages:
        """Save an animated WebP to the output folder and wrap it for the node UI."""
        saved = ImageSaveHelper.save_animated_webp(
            images,
            filename_prefix=filename_prefix,
            folder_type=FolderType.output,
            cls=cls,
            fps=fps,
            lossless=lossless,
            quality=quality,
            method=method,
        )
        # A single frame is reported as a still image.
        return SavedImages([saved], is_animated=len(images) > 1)
 class AudioSaveHelper:
    """A helper class with static methods to handle audio saving and metadata."""
    _OPUS_RATES = [8000, 12000, 16000, 24000, 48000]
    # Encoder bit rates selectable through the `quality` argument, per format.
    _OPUS_BIT_RATES = {"64k": 64000, "96k": 96000, "128k": 128000, "192k": 192000, "320k": 320000}
    _MP3_BIT_RATES = {"128k": 128000, "320k": 320000}

    @staticmethod
    def save_audio(
        audio: dict,
        filename_prefix: str,
        folder_type: FolderType,
        cls: Type[ComfyNode] | None,
        format: str = "flac",
        quality: str = "128k",
    ) -> list[SavedResult]:
        """
        Encode every waveform of the batch to its own audio file.

        Args:
            audio: Dict with a "waveform" tensor (iterated along its first
                axis) and a "sample_rate".
            filename_prefix: Prefix handed to `folder_paths.get_save_image_path`.
            folder_type: Which folder (input/output/temp) to write into.
            cls: Node class whose hidden prompt/extra_pnginfo become metadata;
                skipped when None, when it has no hidden values, or when
                metadata is disabled.
            format: "opus" or "mp3"; anything else is encoded as FLAC.
            quality: Bit-rate label, or "V0" for mp3; unknown labels keep the
                encoder default.

        Returns:
            One SavedResult per written file.
        """
        full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path(
            filename_prefix, _get_directory_by_folder_type(folder_type)
        )

        metadata = {}
        # Guard against missing hidden values, as ImageSaveHelper does.
        if not args.disable_metadata and cls is not None and cls.hidden is not None:
            if cls.hidden.prompt is not None:
                metadata["prompt"] = json.dumps(cls.hidden.prompt)
            if cls.hidden.extra_pnginfo is not None:
                for x in cls.hidden.extra_pnginfo:
                    metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])

        source_rate = audio["sample_rate"]
        results = []
        for batch_number, waveform in enumerate(audio["waveform"].cpu()):
            filename_with_batch_num = filename.replace("%batch_num%", str(batch_number))
            file = f"{filename_with_batch_num}_{counter:05}_.{format}"
            output_path = os.path.join(full_output_folder, file)

            # Use original sample rate initially
            sample_rate = source_rate

            # Handle Opus sample rate requirements
            if format == "opus":
                if sample_rate > 48000:
                    sample_rate = 48000
                elif sample_rate not in AudioSaveHelper._OPUS_RATES:
                    # Next highest supported rate, falling back to 48000
                    sample_rate = next(
                        (rate for rate in sorted(AudioSaveHelper._OPUS_RATES) if rate > sample_rate),
                        48000,
                    )

                # Resample if necessary
                if sample_rate != source_rate:
                    waveform = torchaudio.functional.resample(waveform, source_rate, sample_rate)

            # Encode into memory; the context manager closes the container
            # even when encoding fails.
            output_buffer = BytesIO()
            with av.open(output_buffer, mode="w", format=format) as output_container:
                # Set metadata on the container
                for key, value in metadata.items():
                    output_container.metadata[key] = value

                # Set up the output stream with appropriate properties
                if format == "opus":
                    out_stream = output_container.add_stream("libopus", rate=sample_rate)
                    bit_rate = AudioSaveHelper._OPUS_BIT_RATES.get(quality)
                    if bit_rate is not None:
                        out_stream.bit_rate = bit_rate
                elif format == "mp3":
                    out_stream = output_container.add_stream("libmp3lame", rate=sample_rate)
                    if quality == "V0":
                        # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool
                        out_stream.codec_context.qscale = 1
                    else:
                        bit_rate = AudioSaveHelper._MP3_BIT_RATES.get(quality)
                        if bit_rate is not None:
                            out_stream.bit_rate = bit_rate
                else:  # format == "flac":
                    out_stream = output_container.add_stream("flac", rate=sample_rate)

                # Pack the whole waveform into one frame, samples interleaved by channel.
                frame = av.AudioFrame.from_ndarray(
                    waveform.movedim(0, 1).reshape(1, -1).float().numpy(),
                    format="flt",
                    layout="mono" if waveform.shape[0] == 1 else "stereo",
                )
                frame.sample_rate = sample_rate
                frame.pts = 0
                output_container.mux(out_stream.encode(frame))

                # Flush encoder
                output_container.mux(out_stream.encode(None))

            # Write the output to file, only after the container has been closed
            with open(output_path, "wb") as f:
                f.write(output_buffer.getbuffer())

            results.append(SavedResult(file, subfolder, folder_type))
            counter += 1

        return results

    @staticmethod
    def get_save_audio_ui(
        audio, filename_prefix: str, cls: Type[ComfyNode] | None, format: str = "flac", quality: str = "128k",
    ) -> SavedAudios:
        """Save and instantly wrap for UI."""
        return SavedAudios(
            AudioSaveHelper.save_audio(
                audio,
                filename_prefix=filename_prefix,
                folder_type=FolderType.output,
                cls=cls,
                format=format,
                quality=quality,
            )
        )
 class PreviewImage(_UIOutput):
    """UI output that saves an image batch to the temp folder for previewing."""

    def __init__(self, image: Image.Type, animated: bool = False, cls: Type[ComfyNode] = None, **kwargs):
        """
        Save `image` as temporary PNGs and remember the results.

        Args:
            image: Image batch handed to `ImageSaveHelper.save_images`.
            animated: Flag reported back under "animated" by `as_dict`.
            cls: Node class used as the metadata source, if any.
            **kwargs: Accepted and ignored.
        """
        # Random 5-letter suffix drawn from the full lowercase alphabet, the
        # same alphabet PreviewAudio uses.
        suffix = ''.join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5))
        self.values = ImageSaveHelper.save_images(
            image,
            filename_prefix="ComfyUI_temp_" + suffix,
            folder_type=FolderType.temp,
            cls=cls,
            compress_level=1,
        )
        self.animated = animated

    def as_dict(self):
        """Return the saved results under "images" and the flag under "animated"."""
        return {
            "images": self.values,
            "animated": (self.animated,)
        }
 class PreviewMask(PreviewImage):
    """Preview for masks: expands the mask to three channels and previews it as an image."""

    def __init__(self, mask: PreviewMask.Type, animated: bool=False, cls: ComfyNode=None, **kwargs):
        height, width = mask.shape[-2], mask.shape[-1]
        # Flatten all leading axes into a batch with one channel...
        single_channel = mask.reshape((-1, 1, height, width))
        # ...move that channel axis to the end and repeat it three times.
        preview = single_channel.movedim(1, -1).expand(-1, -1, -1, 3)
        super().__init__(preview, animated, cls, **kwargs)
 class PreviewAudio(_UIOutput):
    """UI output that saves audio as FLAC to the temp folder for previewing."""

    def __init__(self, audio: dict, cls: Type[ComfyNode] = None, **kwargs):
        # Random 5-letter suffix for the temp file name.
        suffix = "".join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5))
        self.values = AudioSaveHelper.save_audio(
            audio,
            filename_prefix="ComfyUI_temp_" + suffix,
            folder_type=FolderType.temp,
            cls=cls,
            format="flac",
            quality="128k",
        )

    def as_dict(self) -> dict:
        """Return the saved results under the "audio" key."""
        payload = {"audio": self.values}
        return payload
 class PreviewVideo(_UIOutput):
    """UI output for already saved video results, always reported as animated."""

    def __init__(self, values: list[SavedResult | dict], **kwargs):
        self.values = values

    def as_dict(self):
        """Return the values under "images" with "animated" fixed to (True,)."""
        payload = {"images": self.values}
        payload["animated"] = (True,)
        return payload
 class PreviewUI3D(_UIOutput):
    """UI output pairing a model file with camera info."""

    def __init__(self, model_file, camera_info, **kwargs):
        self.model_file = model_file
        self.camera_info = camera_info

    def as_dict(self):
        """Return [model_file, camera_info] under the "result" key."""
        pair = [self.model_file, self.camera_info]
        return {"result": pair}
 class PreviewText(_UIOutput):
    """UI output carrying a single text value."""

    def __init__(self, value: str, **kwargs):
        self.value = value

    def as_dict(self):
        """Return the value wrapped in a 1-tuple under the "text" key."""
        wrapped = (self.value,)
        return {"text": wrapped}
 class _UI:
    # Namespace class: re-exposes this module's UI output and save-helper
    # classes as attributes under their own names.
    SavedResult = SavedResult
    SavedImages = SavedImages
    SavedAudios = SavedAudios
    ImageSaveHelper = ImageSaveHelper
    AudioSaveHelper = AudioSaveHelper
    PreviewImage = PreviewImage
    PreviewMask = PreviewMask
    PreviewAudio = PreviewAudio
    PreviewVideo = PreviewVideo
    PreviewUI3D = PreviewUI3D
    PreviewText = PreviewText
--- a/comfy_api/latest/_util/init.py
+++ b/comfy_api/latest/_util/init.py
@ -0,0 +1,8 @@
 from .video_types import VideoContainer, VideoCodec, VideoComponents
 __all__ = [
    # Utility Types
    "VideoContainer",
    "VideoCodec",
    "VideoComponents",
 ]
--- a/comfy_api/latest/_util/video_types.py
+++ b/comfy_api/latest/_util/video_types.py
@ -0,0 +1,52 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from enum import Enum
 from fractions import Fraction
 from typing import Optional
 from comfy_api.latest._input import ImageInput, AudioInput
 class VideoCodec(str, Enum):
    AUTO = "auto"
    H264 = "h264"
    @classmethod
    def as_input(cls) -> list[str]:
        """
        List the string value of every codec member, in definition order,
        for use as the options of a node input.
        """
        codec_names = [codec.value for codec in cls]
        return codec_names
 class VideoContainer(str, Enum):
    AUTO = "auto"
    MP4 = "mp4"
    @classmethod
    def as_input(cls) -> list[str]:
        """
        List the string value of every container member, in definition order,
        for use as the options of a node input.
        """
        container_names = [container.value for container in cls]
        return container_names
    @classmethod
    def get_extension(cls, value) -> str:
        """
        Map a container (member or its string value) to a file extension.

        Strings are first converted to a member, so an unknown string raises
        ValueError. Both MP4 and AUTO map to "mp4"; anything else yields "".
        """
        if isinstance(value, str):
            value = cls(value)
        maps_to_mp4 = value == VideoContainer.MP4 or value == VideoContainer.AUTO
        return "mp4" if maps_to_mp4 else ""
@dataclass
 class VideoComponents:
    """
    Dataclass representing the components of a video.

    ``images`` and ``frame_rate`` are required; ``audio`` and ``metadata``
    are optional and default to ``None``.
    """
    # Image data of the video (presumably its frames — confirm against ImageInput).
    images: ImageInput
    # Frame rate held as an exact Fraction (presumably frames per second — confirm).
    frame_rate: Fraction
    # Optional audio; None when absent.
    audio: Optional[AudioInput] = None
    # Optional free-form metadata dict; None when absent.
    metadata: Optional[dict] = None
--- a/comfy_api/latest/generated/ComfyAPISyncStub.pyi
+++ b/comfy_api/latest/generated/ComfyAPISyncStub.pyi
@ -0,0 +1,20 @@
 from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple
 from comfy_api.latest import ComfyAPI_latest
 from PIL.Image import Image
 from torch import Tensor
 class ComfyAPISyncStub:
    # Stub-only declaration: every method body is `...`; only signatures matter.
    def __init__(self) -> None: ...
    class ExecutionSync:
        def __init__(self) -> None: ...
        # NOTE(review): the string below is a free-standing expression sitting
        # between two methods, so it is not attached to `set_progress` as a
        # docstring even though its text describes that method.
        """
        Update the progress bar displayed in the ComfyUI interface.
        This function allows custom nodes and API calls to report their progress
        back to the user interface, providing visual feedback during long operations.
        Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK
        """
        def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ...
    # Attribute annotated with the nested ExecutionSync class.
    execution: ExecutionSync
--- a/comfy_api/util.py
+++ b/comfy_api/util.py
@ -0,0 +1,8 @@
 # This file only exists for backwards compatibility.
 from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents
 __all__ = [
    "VideoCodec",
    "VideoContainer",
    "VideoComponents",
 ]
--- a/comfy_api/util/__init__.py
+++ b/comfy_api/util/__init__.py
@ -1,7 +1,7 @@
-from .video_types import VideoContainer, VideoCodec, VideoComponents
+# This file only exists for backwards compatibility.
 from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
 __all__ = [
    # Utility Types
    "VideoContainer",
    "VideoCodec",
    "VideoComponents",
--- a/comfy_api/util/video_types.py
+++ b/comfy_api/util/video_types.py
@ -1,51 +1,12 @@
-from __future__ import annotations
+# This file only exists for backwards compatibility.
-from dataclasses import dataclass
+from comfy_api.latest._util.video_types import (
-from enum import Enum
+    VideoContainer,
-from fractions import Fraction
+    VideoCodec,
-from typing import Optional
+    VideoComponents,
-from comfy_api.input import ImageInput, AudioInput
+)
 class VideoCodec(str, Enum):
    AUTO = "auto"
    H264 = "h264"
    @classmethod
    def as_input(cls) -> list[str]:
        """
        List the string value of every codec member, in definition order,
        for use as the options of a node input.
        """
        codec_names = [codec.value for codec in cls]
        return codec_names
 class VideoContainer(str, Enum):
    AUTO = "auto"
    MP4 = "mp4"
    @classmethod
    def as_input(cls) -> list[str]:
        """
        List the string value of every container member, in definition order,
        for use as the options of a node input.
        """
        container_names = [container.value for container in cls]
        return container_names
    @classmethod
    def get_extension(cls, value) -> str:
        """
        Map a container (member or its string value) to a file extension.

        Strings are first converted to a member, so an unknown string raises
        ValueError. Both MP4 and AUTO map to "mp4"; anything else yields "".
        """
        if isinstance(value, str):
            value = cls(value)
        maps_to_mp4 = value == VideoContainer.MP4 or value == VideoContainer.AUTO
        return "mp4" if maps_to_mp4 else ""
@dataclass
 class VideoComponents:
    """
    Dataclass representing the components of a video.

    ``images`` and ``frame_rate`` are required; ``audio`` and ``metadata``
    are optional and default to ``None``.
    """
    # Image data of the video (presumably its frames — confirm against ImageInput).
    images: ImageInput
    # Frame rate held as an exact Fraction (presumably frames per second — confirm).
    frame_rate: Fraction
    # Optional audio; None when absent.
    audio: Optional[AudioInput] = None
    # Optional free-form metadata dict; None when absent.
    metadata: Optional[dict] = None
 __all__ = [
    "VideoContainer",
    "VideoCodec",
    "VideoComponents",
 ]
--- a/comfy_api/v0_0_1/__init__.py
+++ b/comfy_api/v0_0_1/__init__.py
@ -0,0 +1,42 @@
 from comfy_api.v0_0_2 import (
    ComfyAPIAdapter_v0_0_2,
    Input as Input_v0_0_2,
    InputImpl as InputImpl_v0_0_2,
    Types as Types_v0_0_2,
 )
 from typing import Type, TYPE_CHECKING
 from comfy_api.internal.async_to_sync import create_sync_class
 # This version only exists to serve as a template for future version adapters.
 # There is no reason anyone should ever use it.
 class ComfyAPIAdapter_v0_0_1(ComfyAPIAdapter_v0_0_2):
    # Adapter for API version 0.0.1: inherits everything from the v0.0.2
    # adapter and overrides only the two class attributes below.
    VERSION = "0.0.1"
    STABLE = True
 class Input(Input_v0_0_2):
    # Empty subclass: adds nothing to the v0.0.2 Input class it inherits from.
    pass
 class InputImpl(InputImpl_v0_0_2):
    # Empty subclass: adds nothing to the v0.0.2 InputImpl class it inherits from.
    pass
 class Types(Types_v0_0_2):
    # Empty subclass: adds nothing to the v0.0.2 Types class it inherits from.
    pass
 ComfyAPI = ComfyAPIAdapter_v0_0_1
 # Create a synchronous version of the API
 if TYPE_CHECKING:
    from comfy_api.v0_0_1.generated.ComfyAPISyncStub import ComfyAPISyncStub  # type: ignore
    ComfyAPISync: Type[ComfyAPISyncStub]
 ComfyAPISync = create_sync_class(ComfyAPIAdapter_v0_0_1)
 __all__ = [
    "ComfyAPI",
    "ComfyAPISync",
    "Input",
    "InputImpl",
    "Types",
 ]
--- a/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi
+++ b/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi
@ -0,0 +1,20 @@
 from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple
 from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1
 from PIL.Image import Image
 from torch import Tensor
 class ComfyAPISyncStub:
    # Stub-only declaration: every method body is `...`; only signatures matter.
    def __init__(self) -> None: ...
    class ExecutionSync:
        def __init__(self) -> None: ...
        # NOTE(review): the string below is a free-standing expression sitting
        # between two methods, so it is not attached to `set_progress` as a
        # docstring even though its text describes that method.
        """
        Update the progress bar displayed in the ComfyUI interface.
        This function allows custom nodes and API calls to report their progress
        back to the user interface, providing visual feedback during long operations.
        Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK
        """
        def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ...
    # Attribute annotated with the nested ExecutionSync class.
    execution: ExecutionSync
--- a/comfy_api/v0_0_2/__init__.py
+++ b/comfy_api/v0_0_2/__init__.py
@ -0,0 +1,45 @@
 from comfy_api.latest import (
    ComfyAPI_latest,
    Input as Input_latest,
    InputImpl as InputImpl_latest,
    Types as Types_latest,
 )
 from typing import Type, TYPE_CHECKING
 from comfy_api.internal.async_to_sync import create_sync_class
 from comfy_api.latest import io, ui, ComfyExtension  #noqa: F401
 class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest):
    # Adapter for API version 0.0.2: inherits everything from the latest API
    # class and overrides only the two class attributes below.
    VERSION = "0.0.2"
    STABLE = False
 class Input(Input_latest):
    # Empty subclass: adds nothing to the latest Input class it inherits from.
    pass
 class InputImpl(InputImpl_latest):
    # Empty subclass: adds nothing to the latest InputImpl class it inherits from.
    pass
 class Types(Types_latest):
    # Empty subclass: adds nothing to the latest Types class it inherits from.
    pass
 ComfyAPI = ComfyAPIAdapter_v0_0_2
 # Create a synchronous version of the API
 if TYPE_CHECKING:
    from comfy_api.v0_0_2.generated.ComfyAPISyncStub import ComfyAPISyncStub  # type: ignore
    ComfyAPISync: Type[ComfyAPISyncStub]
 ComfyAPISync = create_sync_class(ComfyAPIAdapter_v0_0_2)
 __all__ = [
    "ComfyAPI",
    "ComfyAPISync",
    "Input",
    "InputImpl",
    "Types",
    "ComfyExtension",
 ]
--- a/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi
+++ b/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi
@ -0,0 +1,20 @@
 from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple
 from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2
 from PIL.Image import Image
 from torch import Tensor
 class ComfyAPISyncStub:
    # Stub-only declaration: every method body is `...`; only signatures matter.
    def __init__(self) -> None: ...
    class ExecutionSync:
        def __init__(self) -> None: ...
        # NOTE(review): the string below is a free-standing expression sitting
        # between two methods, so it is not attached to `set_progress` as a
        # docstring even though its text describes that method.
        """
        Update the progress bar displayed in the ComfyUI interface.
        This function allows custom nodes and API calls to report their progress
        back to the user interface, providing visual feedback during long operations.
        Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK
        """
        def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ...
    # Attribute annotated with the nested ExecutionSync class.
    execution: ExecutionSync
--- a/comfy_api/version_list.py
+++ b/comfy_api/version_list.py
@ -0,0 +1,12 @@
 from comfy_api.latest import ComfyAPI_latest
 from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2
 from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1
 from comfy_api.internal import ComfyAPIBase
 from typing import List, Type
 supported_versions: List[Type[ComfyAPIBase]] = [
    # API classes (not instances), listed newest first: the latest API,
    # then the v0.0.2 adapter, then the v0.0.1 adapter.
    ComfyAPI_latest,
    ComfyAPIAdapter_v0_0_2,
    ComfyAPIAdapter_v0_0_1,
 ]
--- a/comfy_api_nodes/README.md
+++ b/comfy_api_nodes/README.md
@ -2,7 +2,7 @@
 ## Introduction 
-Below are a collection of nodes that work by calling external APIs. More information available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview#api-nodes).
+Below is a collection of nodes that work by calling external APIs. More information is available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview).
 ## Development
--- a/comfy_api_nodes/apis/__init__.py
+++ b/comfy_api_nodes/apis/__init__.py
@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  filtered-openapi.yaml
-#   timestamp: 2025-05-19T21:38:55+00:00
+#   timestamp: 2025-07-06T09:47:31+00:00
 from __future__ import annotations
@ -1355,6 +1355,158 @@ class ModelResponseProperties(BaseModel):
    )
 class Keyframes(BaseModel):
    # Single optional field; None when omitted.
    image_url: Optional[str] = None
 class MoonvalleyPromptResponse(BaseModel):
    # Every field is optional and defaults to None; the dict-typed fields are
    # left untyped beyond Dict[str, Any].
    error: Optional[Dict[str, Any]] = None
    frame_conditioning: Optional[Dict[str, Any]] = None
    id: Optional[str] = None
    inference_params: Optional[Dict[str, Any]] = None
    meta: Optional[Dict[str, Any]] = None
    model_params: Optional[Dict[str, Any]] = None
    output_url: Optional[str] = None
    prompt_text: Optional[str] = None
    status: Optional[str] = None
 class MoonvalleyTextToVideoInferenceParams(BaseModel):
    # Every field is Optional and declares its default as the first argument of
    # Field(...); the description strings document each field.
    # cooldown_steps, negative_prompt, seed and warmup_steps default to None.
    add_quality_guidance: Optional[bool] = Field(
        True, description='Whether to add quality guidance'
    )
    caching_coefficient: Optional[float] = Field(
        0.3, description='Caching coefficient for optimization'
    )
    caching_cooldown: Optional[int] = Field(
        3, description='Number of caching cooldown steps'
    )
    caching_warmup: Optional[int] = Field(
        3, description='Number of caching warmup steps'
    )
    # Float-typed field whose default is written as the integer literal 3.
    clip_value: Optional[float] = Field(
        3, description='CLIP value for generation control'
    )
    conditioning_frame_index: Optional[int] = Field(
        0, description='Index of the conditioning frame'
    )
    cooldown_steps: Optional[int] = Field(
        None, description='Number of cooldown steps (calculated based on num_frames)'
    )
    fps: Optional[int] = Field(
        24, description='Frames per second of the generated video'
    )
    guidance_scale: Optional[float] = Field(
        12.5, description='Guidance scale for generation control'
    )
    height: Optional[int] = Field(
        1080, description='Height of the generated video in pixels'
    )
    negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
    num_frames: Optional[int] = Field(64, description='Number of frames to generate')
    seed: Optional[int] = Field(
        None, description='Random seed for generation (default: random)'
    )
    # Float-typed field whose default is written as the integer literal 3.
    shift_value: Optional[float] = Field(
        3, description='Shift value for generation control'
    )
    steps: Optional[int] = Field(80, description='Number of denoising steps')
    use_guidance_schedule: Optional[bool] = Field(
        True, description='Whether to use guidance scheduling'
    )
    use_negative_prompts: Optional[bool] = Field(
        False, description='Whether to use negative prompts'
    )
    use_timestep_transform: Optional[bool] = Field(
        True, description='Whether to use timestep transformation'
    )
    warmup_steps: Optional[int] = Field(
        None, description='Number of warmup steps (calculated based on num_frames)'
    )
    width: Optional[int] = Field(
        1920, description='Width of the generated video in pixels'
    )
 class MoonvalleyTextToVideoRequest(BaseModel):
    # All fields are optional and default to None; inference_params nests a
    # MoonvalleyTextToVideoInferenceParams model.
    image_url: Optional[str] = None
    inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None
    prompt_text: Optional[str] = None
    webhook_url: Optional[str] = None
 class MoonvalleyUploadFileRequest(BaseModel):
    # Single optional field typed StrictBytes; None when omitted.
    file: Optional[StrictBytes] = None
 class MoonvalleyUploadFileResponse(BaseModel):
    # Single optional string field; None when omitted.
    access_url: Optional[str] = None
 class MoonvalleyVideoToVideoInferenceParams(BaseModel):
    # Every field is Optional and declares its default as the first argument of
    # Field(...); the description strings document each field.
    # Unlike MoonvalleyTextToVideoInferenceParams, this model declares no
    # fps, height, num_frames or width fields.
    add_quality_guidance: Optional[bool] = Field(
        True, description='Whether to add quality guidance'
    )
    caching_coefficient: Optional[float] = Field(
        0.3, description='Caching coefficient for optimization'
    )
    caching_cooldown: Optional[int] = Field(
        3, description='Number of caching cooldown steps'
    )
    caching_warmup: Optional[int] = Field(
        3, description='Number of caching warmup steps'
    )
    # Float-typed field whose default is written as the integer literal 3.
    clip_value: Optional[float] = Field(
        3, description='CLIP value for generation control'
    )
    conditioning_frame_index: Optional[int] = Field(
        0, description='Index of the conditioning frame'
    )
    cooldown_steps: Optional[int] = Field(
        None, description='Number of cooldown steps (calculated based on num_frames)'
    )
    guidance_scale: Optional[float] = Field(
        12.5, description='Guidance scale for generation control'
    )
    negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
    seed: Optional[int] = Field(
        None, description='Random seed for generation (default: random)'
    )
    # Float-typed field whose default is written as the integer literal 3.
    shift_value: Optional[float] = Field(
        3, description='Shift value for generation control'
    )
    steps: Optional[int] = Field(80, description='Number of denoising steps')
    use_guidance_schedule: Optional[bool] = Field(
        True, description='Whether to use guidance scheduling'
    )
    use_negative_prompts: Optional[bool] = Field(
        False, description='Whether to use negative prompts'
    )
    use_timestep_transform: Optional[bool] = Field(
        True, description='Whether to use timestep transformation'
    )
    warmup_steps: Optional[int] = Field(
        None, description='Number of warmup steps (calculated based on num_frames)'
    )
 class ControlType(str, Enum):
    # String-valued enum; each member's value equals its name.
    motion_control = 'motion_control'
    pose_control = 'pose_control'
 class MoonvalleyVideoToVideoRequest(BaseModel):
    # control_type, prompt_text and video_url are required (Field(...));
    # inference_params and webhook_url are optional and default to None.
    control_type: ControlType = Field(
        ..., description='Supported types for video control'
    )
    inference_params: Optional[MoonvalleyVideoToVideoInferenceParams] = None
    prompt_text: str = Field(..., description='Describes the video to generate')
    video_url: str = Field(..., description='Url to control video')
    webhook_url: Optional[str] = Field(
        None, description='Optional webhook URL for notifications'
    )
 class Moderation(str, Enum):
    low = 'low'
    auto = 'auto'
@ -3107,6 +3259,23 @@ class LumaUpscaleVideoGenerationRequest(BaseModel):
    resolution: Optional[LumaVideoModelOutputResolution] = None
 class MoonvalleyImageToVideoRequest(MoonvalleyTextToVideoRequest):
    # Extends the text-to-video request with an optional mapping from string
    # keys to Keyframes models; None when omitted.
    keyframes: Optional[Dict[str, Keyframes]] = None
 class MoonvalleyResizeVideoRequest(MoonvalleyVideoToVideoRequest):
    # Extends the video-to-video request with three optional integer lists.
    # min_length=2 and max_length=2 mean each list, when given, must hold
    # exactly two items.
    frame_position: Optional[List[int]] = Field(None, max_length=2, min_length=2)
    frame_resolution: Optional[List[int]] = Field(None, max_length=2, min_length=2)
    scale: Optional[List[int]] = Field(None, max_length=2, min_length=2)
 class MoonvalleyTextToImageRequest(BaseModel):
    # Declares the same four optional fields as MoonvalleyTextToVideoRequest,
    # including the text-to-video inference parameter model.
    image_url: Optional[str] = None
    inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None
    prompt_text: Optional[str] = None
    webhook_url: Optional[str] = None
 class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]):
    root: Union[OutputTextContent, OutputAudioContent]
--- a/comfy_api_nodes/apis/request_logger.py
+++ b/comfy_api_nodes/apis/request_logger.py
@ -1,3 +1,5 @@
 from __future__ import annotations
 import os
 import datetime
 import json
--- a/comfy_api_nodes/nodes_bfl.py
+++ b/comfy_api_nodes/nodes_bfl.py
@ -346,20 +346,6 @@ class FluxKontextProImageNode(ComfyNodeABC):
            },
        }
    @classmethod
    def VALIDATE_INPUTS(cls, aspect_ratio: str):
        """
        Check ``aspect_ratio`` against this class's ratio limits.

        Returns ``True`` when ``validate_aspect_ratio`` does not raise; otherwise
        returns the exception's message as a string.
        """
        try:
            validate_aspect_ratio(
                aspect_ratio,
                minimum_ratio=cls.MINIMUM_RATIO,
                maximum_ratio=cls.MAXIMUM_RATIO,
                minimum_ratio_str=cls.MINIMUM_RATIO_STR,
                maximum_ratio_str=cls.MAXIMUM_RATIO_STR,
            )
        except Exception as e:
            # Any failure is reported as its message instead of propagating.
            return str(e)
        return True
    RETURN_TYPES = (IO.IMAGE,)
    DESCRIPTION = cleandoc(__doc__ or "")  # Handle potential None value
    FUNCTION = "api_call"
@ -380,6 +366,13 @@ class FluxKontextProImageNode(ComfyNodeABC):
        unique_id: Union[str, None] = None,
        **kwargs,
    ):
        aspect_ratio = validate_aspect_ratio(
            aspect_ratio,
            minimum_ratio=self.MINIMUM_RATIO,
            maximum_ratio=self.MAXIMUM_RATIO,
            minimum_ratio_str=self.MINIMUM_RATIO_STR,
            maximum_ratio_str=self.MAXIMUM_RATIO_STR,
        )
        if input_image is None:
            validate_string(prompt, strip_whitespace=False)
        operation = SynchronousOperation(
@ -395,13 +388,7 @@ class FluxKontextProImageNode(ComfyNodeABC):
                guidance=round(guidance, 1),
                steps=steps,
                seed=seed,
-                aspect_ratio=validate_aspect_ratio(
+                aspect_ratio=aspect_ratio,
                    aspect_ratio,
                    minimum_ratio=self.MINIMUM_RATIO,
                    maximum_ratio=self.MAXIMUM_RATIO,
                    minimum_ratio_str=self.MINIMUM_RATIO_STR,
                    maximum_ratio_str=self.MAXIMUM_RATIO_STR,
                ),
                input_image=(
                    input_image
                    if input_image is None
--- a/comfy_api_nodes/nodes_gemini.py
+++ b/comfy_api_nodes/nodes_gemini.py
@ -2,6 +2,8 @@
 API Nodes for Gemini Multimodal LLM Usage via Remote API
 See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
 """
 from __future__ import annotations
 import os
 from enum import Enum
@ -406,7 +408,7 @@ class GeminiInputFiles(ComfyNodeABC):
    def create_file_part(self, file_path: str) -> GeminiPart:
        mime_type = (
-            GeminiMimeType.pdf
+            GeminiMimeType.application_pdf
            if file_path.endswith(".pdf")
            else GeminiMimeType.text_plain
        )
--- a/comfy_api_nodes/nodes_ideogram.py
+++ b/comfy_api_nodes/nodes_ideogram.py
@ -324,7 +324,7 @@ class IdeogramV1(ComfyNodeABC):
    RETURN_TYPES = (IO.IMAGE,)
    FUNCTION = "api_call"
-    CATEGORY = "api node/image/Ideogram/v1"
+    CATEGORY = "api node/image/Ideogram"
    DESCRIPTION = cleandoc(__doc__ or "")
    API_NODE = True
@ -483,7 +483,7 @@ class IdeogramV2(ComfyNodeABC):
    RETURN_TYPES = (IO.IMAGE,)
    FUNCTION = "api_call"
-    CATEGORY = "api node/image/Ideogram/v2"
+    CATEGORY = "api node/image/Ideogram"
    DESCRIPTION = cleandoc(__doc__ or "")
    API_NODE = True
@ -649,7 +649,7 @@ class IdeogramV3(ComfyNodeABC):
    RETURN_TYPES = (IO.IMAGE,)
    FUNCTION = "api_call"
-    CATEGORY = "api node/image/Ideogram/v3"
+    CATEGORY = "api node/image/Ideogram"
    DESCRIPTION = cleandoc(__doc__ or "")
    API_NODE = True
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@ -132,6 +132,8 @@ def poll_until_finished(
        result_url_extractor=result_url_extractor,
        estimated_duration=estimated_duration,
        node_id=node_id,
        poll_interval=16.0,
        max_poll_attempts=256,
    ).execute()
--- a/Show More
+++ b/Show More