Merge branch 'master' into worksplit-multigpu-wip

Jedrzej Kosinski 2025-07-27 01:03:25 -07:00
commit 3b90a30178
23 changed files with 1123 additions and 259 deletions

View File

@@ -17,8 +17,7 @@ jobs:
      - name: Check for Windows line endings (CRLF)
        run: |
          # Get the list of changed files in the PR
-         git merge origin/${{ github.base_ref }} --no-edit
-         CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD)
+         CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }})
          # Flag to track if CRLF is found
          CRLF_FOUND=false

View File

@@ -294,6 +294,13 @@ For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a
2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html)
3. Launch ComfyUI by running `python main.py`

+#### Iluvatar Corex
+
+For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step guide tailored to your platform and installation method:
+
+1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536)
+2. Launch ComfyUI by running `python main.py`

# Running
```python main.py```

View File

@@ -29,18 +29,48 @@ def frontend_install_warning_message():
This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
""".strip()
def parse_version(version: str) -> tuple[int, int, int]:
return tuple(map(int, version.split(".")))
def is_valid_version(version: str) -> bool:
"""Validate if a string is a valid semantic version (X.Y.Z format)."""
pattern = r"^(\d+)\.(\d+)\.(\d+)$"
return bool(re.match(pattern, version))
def get_installed_frontend_version():
"""Get the currently installed frontend package version."""
frontend_version_str = version("comfyui-frontend-package")
return frontend_version_str
def get_required_frontend_version():
"""Get the required frontend version from requirements.txt."""
try:
with open(requirements_path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line.startswith("comfyui-frontend-package=="):
version_str = line.split("==")[-1]
if not is_valid_version(version_str):
logging.error(f"Invalid version format in requirements.txt: {version_str}")
return None
return version_str
logging.error("comfyui-frontend-package not found in requirements.txt")
return None
except FileNotFoundError:
logging.error("requirements.txt not found. Cannot determine required frontend version.")
return None
except Exception as e:
logging.error(f"Error reading requirements.txt: {e}")
return None
def check_frontend_version():
    """Check if the frontend version is up to date."""

-   def parse_version(version: str) -> tuple[int, int, int]:
-       return tuple(map(int, version.split(".")))

    try:
-       frontend_version_str = version("comfyui-frontend-package")
+       frontend_version_str = get_installed_frontend_version()
        frontend_version = parse_version(frontend_version_str)
-       with open(requirements_path, "r", encoding="utf-8") as f:
-           required_frontend = parse_version(f.readline().split("=")[-1])
+       required_frontend_str = get_required_frontend_version()
+       required_frontend = parse_version(required_frontend_str)
        if frontend_version < required_frontend:
            app.logger.log_startup_warning(
                f"""
@@ -168,6 +198,11 @@ class FrontendManager:
class FrontendManager:
    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")
@classmethod
def get_required_frontend_version(cls) -> str:
"""Get the required frontend package version."""
return get_required_frontend_version()
    @classmethod
    def default_frontend_path(cls) -> str:
        try:
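Taken together, the new helpers reduce the startup check to a simple tuple comparison. A minimal standalone sketch of that logic (the two helpers are re-declared here so the snippet runs on its own; the version strings are placeholders for the installed package version and the requirements.txt pin):

```
import re

def is_valid_version(version: str) -> bool:
    # X.Y.Z semantic version check, mirroring the helper added above
    return bool(re.match(r"^(\d+)\.(\d+)\.(\d+)$", version))

def parse_version(version: str) -> tuple[int, int, int]:
    return tuple(map(int, version.split(".")))

installed = "1.23.4"   # stand-in for version("comfyui-frontend-package")
required = "1.24.0"    # stand-in for the comfyui-frontend-package== pin in requirements.txt

assert is_valid_version(installed) and is_valid_version(required)
if parse_version(installed) < parse_version(required):
    print(f"Frontend outdated: {installed} < {required}")  # tuple comparison is element-wise
```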

View File

@@ -49,7 +49,8 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use.")
+parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use. All other devices will not be visible.")
+parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
cm_group = parser.add_mutually_exclusive_group()
cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.")

View File

@@ -1210,39 +1210,21 @@ def sample_deis(model, x, sigmas, extra_args=None, callback=None, disable=None,
    return x_next

-@torch.no_grad()
-def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
-    extra_args = {} if extra_args is None else extra_args
-    temp = [0]
-    def post_cfg_function(args):
-        temp[0] = args["uncond_denoised"]
-        return args["denoised"]
-    model_options = extra_args.get("model_options", {}).copy()
-    extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
-    s_in = x.new_ones([x.shape[0]])
-    for i in trange(len(sigmas) - 1, disable=disable):
-        sigma_hat = sigmas[i]
-        denoised = model(x, sigma_hat * s_in, **extra_args)
-        d = to_d(x, sigma_hat, temp[0])
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
-        # Euler method
-        x = denoised + d * sigmas[i + 1]
-    return x
@torch.no_grad()
def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    """Ancestral sampling with Euler method steps."""
+    """Ancestral sampling with Euler method steps (CFG++)."""
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    temp = [0]
+    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
+    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
+    uncond_denoised = None

    def post_cfg_function(args):
-        temp[0] = args["uncond_denoised"]
+        nonlocal uncond_denoised
+        uncond_denoised = args["uncond_denoised"]
        return args["denoised"]

    model_options = extra_args.get("model_options", {}).copy()
@@ -1251,15 +1233,33 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        d = to_d(x, sigmas[i], temp[0])
-        # Euler method
-        x = denoised + d * sigma_down
-        if sigmas[i + 1] > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+        if sigmas[i + 1] == 0:
+            # Denoising step
+            x = denoised
+        else:
+            alpha_s = sigmas[i] * lambda_fn(sigmas[i]).exp()
+            alpha_t = sigmas[i + 1] * lambda_fn(sigmas[i + 1]).exp()
+            d = to_d(x, sigmas[i], alpha_s * uncond_denoised)  # to noise
+
+            # DDIM stochastic sampling
+            sigma_down, sigma_up = get_ancestral_step(sigmas[i] / alpha_s, sigmas[i + 1] / alpha_t, eta=eta)
+            sigma_down = alpha_t * sigma_down
+
+            # Euler method
+            x = alpha_t * denoised + sigma_down * d
+            if eta > 0 and s_noise > 0:
+                x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x
@torch.no_grad()
def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
"""Euler method steps (CFG++)."""
return sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None)
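The rewritten sampler recovers the schedule's alpha from sigma via the half-log-SNR, and `sample_euler_cfg_pp` is simply the eta=0, s_noise=0 special case (with eta=0, `get_ancestral_step` returns sigma_up = 0, so the update reduces to the deterministic step). A tiny standalone check of the identity relied on here, assuming `sigma_to_half_log_snr` returns lambda_t = log(alpha_t / sigma_t) for the model's noise schedule:

```
import math

alpha_t, sigma_t = 0.8, 0.6             # hypothetical schedule values
lambda_t = math.log(alpha_t / sigma_t)  # stand-in for lambda_fn(sigma_t)
assert abs(sigma_t * math.exp(lambda_t) - alpha_t) < 1e-12  # sigma * exp(lambda) recovers alpha
```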
@torch.no_grad()
def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""

View File

@ -52,15 +52,6 @@ class RMS_norm(nn.Module):
x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0) x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0)
class Upsample(nn.Upsample):
def forward(self, x):
"""
Fix bfloat16 support for nearest neighbor interpolation.
"""
return super().forward(x.float()).type_as(x)
class Resample(nn.Module):
    def __init__(self, dim, mode):
@@ -73,11 +64,11 @@ class Resample(nn.Module):
        # layers
        if mode == 'upsample2d':
            self.resample = nn.Sequential(
-               Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+               nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                ops.Conv2d(dim, dim // 2, 3, padding=1))
        elif mode == 'upsample3d':
            self.resample = nn.Sequential(
-               Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+               nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                ops.Conv2d(dim, dim // 2, 3, padding=1))
            self.time_conv = CausalConv3d(
                dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
@ -157,29 +148,6 @@ class Resample(nn.Module):
feat_idx[0] += 1 feat_idx[0] += 1
return x return x
def init_weight(self, conv):
conv_weight = conv.weight
nn.init.zeros_(conv_weight)
c1, c2, t, h, w = conv_weight.size()
one_matrix = torch.eye(c1, c2)
init_matrix = one_matrix
nn.init.zeros_(conv_weight)
#conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
conv_weight.data[:, :, 1, 0, 0] = init_matrix #* 0.5
conv.weight.data.copy_(conv_weight)
nn.init.zeros_(conv.bias.data)
def init_weight2(self, conv):
conv_weight = conv.weight.data
nn.init.zeros_(conv_weight)
c1, c2, t, h, w = conv_weight.size()
init_matrix = torch.eye(c1 // 2, c2)
#init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
conv.weight.data.copy_(conv_weight)
nn.init.zeros_(conv.bias.data)
class ResidualBlock(nn.Module): class ResidualBlock(nn.Module):
@ -494,12 +462,6 @@ class WanVAE(nn.Module):
self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks, self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
attn_scales, self.temperal_upsample, dropout) attn_scales, self.temperal_upsample, dropout)
def forward(self, x):
mu, log_var = self.encode(x)
z = self.reparameterize(mu, log_var)
x_recon = self.decode(z)
return x_recon, mu, log_var
def encode(self, x): def encode(self, x):
self.clear_cache() self.clear_cache()
## cache ## cache
@ -545,18 +507,6 @@ class WanVAE(nn.Module):
self.clear_cache() self.clear_cache()
return out return out
def reparameterize(self, mu, log_var):
std = torch.exp(0.5 * log_var)
eps = torch.randn_like(std)
return eps * std + mu
def sample(self, imgs, deterministic=False):
mu, log_var = self.encode(imgs)
if deterministic:
return mu
std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
return mu + std * torch.randn_like(std)
def clear_cache(self): def clear_cache(self):
self._conv_num = count_conv3d(self.decoder) self._conv_num = count_conv3d(self.decoder)
self._conv_idx = [0] self._conv_idx = [0]

View File

@@ -106,7 +106,7 @@ if args.directml is not None:
    lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default.

try:
-    import intel_extension_for_pytorch as ipex
+    import intel_extension_for_pytorch as ipex  # noqa: F401
    _ = torch.xpu.device_count()
    xpu_available = xpu_available or torch.xpu.is_available()
except:
@@ -133,6 +133,11 @@ try:
except:
    mlu_available = False

+try:
+    ixuca_available = hasattr(torch, "corex")
+except:
+    ixuca_available = False
+
if args.cpu:
    cpu_state = CPUState.CPU
@@ -156,6 +161,12 @@ def is_mlu():
        return True
    return False

+def is_ixuca():
+    global ixuca_available
+    if ixuca_available:
+        return True
+    return False
+
def get_torch_device():
    global directml_enabled
    global cpu_state
@@ -210,8 +221,9 @@ def get_total_memory(dev=None, torch_total_too=False):
    elif is_intel_xpu():
        stats = torch.xpu.memory_stats(dev)
        mem_reserved = stats['reserved_bytes.all.current']
+       mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory
        mem_total_torch = mem_reserved
-       mem_total = torch.xpu.get_device_properties(dev).total_memory
+       mem_total = mem_total_xpu
    elif is_ascend_npu():
        stats = torch.npu.memory_stats(dev)
        mem_reserved = stats['reserved_bytes.all.current']
@@ -312,7 +324,7 @@ try:
    if torch_version_numeric[0] >= 2:
        if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
            ENABLE_PYTORCH_ATTENTION = True
-   if is_intel_xpu() or is_ascend_npu() or is_mlu():
+   if is_intel_xpu() or is_ascend_npu() or is_mlu() or is_ixuca():
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
            ENABLE_PYTORCH_ATTENTION = True
except:
@@ -331,7 +343,10 @@ try:
        logging.info("ROCm version: {}".format(rocm_version))
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
            if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much
-               if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx1201 and gfx950
+               if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950
+                   ENABLE_PYTORCH_ATTENTION = True
+           if torch_version_numeric >= (2, 8):
+               if any((a in arch) for a in ["gfx1201"]):
                    ENABLE_PYTORCH_ATTENTION = True
        if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
            if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches
@@ -401,6 +416,8 @@ def get_torch_device_name(device):
        except:
            allocator_backend = ""
        return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend)
+   elif device.type == "xpu":
+       return "{} {}".format(device, torch.xpu.get_device_name(device))
    else:
        return "{}".format(device.type)
elif is_intel_xpu():
@@ -904,6 +921,7 @@ def vae_dtype(device=None, allowed_dtypes=[]):
            return d

        # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32
+       # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3
        if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device):
            return d
@@ -957,7 +975,7 @@ def device_supports_non_blocking(device):
    if is_device_mps(device):
        return False #pytorch bug? mps doesn't support non blocking
    if is_intel_xpu():
-       return False
+       return True
    if args.deterministic: #TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews)
        return False
    if directml_enabled:
@@ -996,6 +1014,8 @@ def get_offload_stream(device):
        stream_counter = (stream_counter + 1) % len(ss)
        if is_device_cuda(device):
            ss[stream_counter].wait_stream(torch.cuda.current_stream())
+       elif is_device_xpu(device):
+           ss[stream_counter].wait_stream(torch.xpu.current_stream())
        stream_counters[device] = stream_counter
        return s
    elif is_device_cuda(device):
@@ -1007,6 +1027,15 @@ def get_offload_stream(device):
        stream_counter = (stream_counter + 1) % len(ss)
        stream_counters[device] = stream_counter
        return s
+   elif is_device_xpu(device):
+       ss = []
+       for k in range(NUM_STREAMS):
+           ss.append(torch.xpu.Stream(device=device, priority=0))
+       STREAMS[device] = ss
+       s = ss[stream_counter]
+       stream_counter = (stream_counter + 1) % len(ss)
+       stream_counters[device] = stream_counter
+       return s
    return None

def sync_stream(device, stream):
@@ -1014,6 +1043,8 @@ def sync_stream(device, stream):
        return
    if is_device_cuda(device):
        torch.cuda.current_stream().wait_stream(stream)
+   elif is_device_xpu(device):
+       torch.xpu.current_stream().wait_stream(stream)

def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None):
    if device is None or weight.device == device:
@ -1055,6 +1086,8 @@ def xformers_enabled():
return False return False
if is_mlu(): if is_mlu():
return False return False
if is_ixuca():
return False
if directml_enabled: if directml_enabled:
return False return False
return XFORMERS_IS_AVAILABLE return XFORMERS_IS_AVAILABLE
@ -1090,6 +1123,8 @@ def pytorch_attention_flash_attention():
return True return True
if is_amd(): if is_amd():
return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention
if is_ixuca():
return True
return False return False
def force_upcast_attention_dtype(): def force_upcast_attention_dtype():
@@ -1120,8 +1155,8 @@ def get_free_memory(dev=None, torch_free_too=False):
        stats = torch.xpu.memory_stats(dev)
        mem_active = stats['active_bytes.all.current']
        mem_reserved = stats['reserved_bytes.all.current']
-       mem_free_torch = mem_reserved - mem_active
        mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved
+       mem_free_torch = mem_reserved - mem_active
        mem_free_total = mem_free_xpu + mem_free_torch
elif is_ascend_npu(): elif is_ascend_npu():
stats = torch.npu.memory_stats(dev) stats = torch.npu.memory_stats(dev)
@@ -1170,6 +1205,9 @@ def is_device_cpu(device):
def is_device_mps(device):
    return is_device_type(device, 'mps')

+def is_device_xpu(device):
+    return is_device_type(device, 'xpu')
+
def is_device_cuda(device):
    return is_device_type(device, 'cuda')
@@ -1201,7 +1239,10 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
        return False

    if is_intel_xpu():
-       return True
+       if torch_version_numeric < (2, 3):
+           return True
+       else:
+           return torch.xpu.get_device_properties(device).has_fp16

    if is_ascend_npu():
        return True
@@ -1209,6 +1250,9 @@
    if is_mlu():
        return True

+   if is_ixuca():
+       return True
+
    if torch.version.hip:
        return True
@@ -1264,11 +1308,17 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
        return False

    if is_intel_xpu():
-       return True
+       if torch_version_numeric < (2, 6):
+           return True
+       else:
+           return torch.xpu.get_device_capability(device)['has_bfloat16_conversions']

    if is_ascend_npu():
        return True

+   if is_ixuca():
+       return True
+
    if is_amd():
        arch = torch.cuda.get_device_properties(device).gcnArchName
        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16

View File

@@ -698,6 +698,26 @@ def resize_to_batch_size(tensor, batch_size):
    return output
def resize_list_to_batch_size(l, batch_size):
in_batch_size = len(l)
if in_batch_size == batch_size or in_batch_size == 0:
return l
if batch_size <= 1:
return l[:batch_size]
output = []
if batch_size < in_batch_size:
scale = (in_batch_size - 1) / (batch_size - 1)
for i in range(batch_size):
output.append(l[min(round(i * scale), in_batch_size - 1)])
else:
scale = in_batch_size / batch_size
for i in range(batch_size):
output.append(l[min(math.floor((i + 0.5) * scale), in_batch_size - 1)])
return output
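To make the sampling behaviour of the new helper concrete, here is a standalone copy of it (re-declared so the snippet runs on its own) with two illustrative calls:

```
import math

def resize_list_to_batch_size(l, batch_size):
    in_batch_size = len(l)
    if in_batch_size == batch_size or in_batch_size == 0:
        return l
    if batch_size <= 1:
        return l[:batch_size]
    output = []
    if batch_size < in_batch_size:
        # downsample: keep both endpoints, pick evenly spaced elements in between
        scale = (in_batch_size - 1) / (batch_size - 1)
        for i in range(batch_size):
            output.append(l[min(round(i * scale), in_batch_size - 1)])
    else:
        # upsample: repeat elements so each source item covers an equal share
        scale = in_batch_size / batch_size
        for i in range(batch_size):
            output.append(l[min(math.floor((i + 0.5) * scale), in_batch_size - 1)])
    return output

print(resize_list_to_batch_size([0, 1, 2, 3, 4, 5], 3))  # [0, 2, 5]
print(resize_list_to_batch_size([0, 1, 2], 6))           # [0, 0, 1, 1, 2, 2]
```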
def convert_sd_to(state_dict, dtype):
    keys = list(state_dict.keys())
    for k in keys:

View File

@@ -15,9 +15,20 @@ adapters: list[type[WeightAdapterBase]] = [
    OFTAdapter,
    BOFTAdapter,
]
adapter_maps: dict[str, type[WeightAdapterBase]] = {
"LoRA": LoRAAdapter,
"LoHa": LoHaAdapter,
"LoKr": LoKrAdapter,
"OFT": OFTAdapter,
## We disable not implemented algo for now
# "GLoRA": GLoRAAdapter,
# "BOFT": BOFTAdapter,
}
__all__ = [
    "WeightAdapterBase",
    "WeightAdapterTrainBase",
-   "adapters"
+   "adapters",
+   "adapter_maps",
] + [a.__name__ for a in adapters]
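A minimal sketch of how training code might consume the new name-keyed map (assumes a ComfyUI checkout is importable; the dummy weight and rank/alpha values are illustrative):

```
import torch
from comfy.weight_adapter import adapter_maps

weight = torch.zeros(128, 64)   # stand-in for a model weight to adapt
train_adapter = adapter_maps["LoKr"].create_train(weight, rank=8, alpha=8.0)
print(type(train_adapter).__name__)   # LokrDiff
```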

View File

@@ -133,3 +133,43 @@ def tucker_weight_from_conv(up, down, mid):
def tucker_weight(wa, wb, t):
    temp = torch.einsum("i j ..., j r -> i r ...", t, wb)
    return torch.einsum("i j ..., i r -> r j ...", temp, wa)
def factorization(dimension: int, factor: int = -1) -> tuple[int, int]:
"""
return a tuple of two value of input dimension decomposed by the number closest to factor
second value is higher or equal than first value.
    examples)
    factor
              -1                2                4                8                16 ...
    127  -> 1, 127    127  -> 1, 127   127  -> 1, 127   127  -> 1, 127   127  -> 1, 127
    128  -> 8, 16     128  -> 2, 64    128  -> 4, 32    128  -> 8, 16    128  -> 8, 16
    250  -> 10, 25    250  -> 2, 125   250  -> 2, 125   250  -> 5, 50    250  -> 10, 25
    360  -> 8, 45     360  -> 2, 180   360  -> 4, 90    360  -> 8, 45    360  -> 12, 30
    512  -> 16, 32    512  -> 2, 256   512  -> 4, 128   512  -> 8, 64    512  -> 16, 32
    1024 -> 32, 32    1024 -> 2, 512   1024 -> 4, 256   1024 -> 8, 128   1024 -> 16, 64
"""
if factor > 0 and (dimension % factor) == 0 and dimension >= factor**2:
m = factor
n = dimension // factor
if m > n:
n, m = m, n
return m, n
if factor < 0:
factor = dimension
m, n = 1, dimension
length = m + n
while m < n:
new_m = m + 1
while dimension % new_m != 0:
new_m += 1
new_n = dimension // new_m
if new_m + new_n > length or new_m > factor:
break
else:
m, n = new_m, new_n
if m > n:
n, m = m, n
return m, n
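A few concrete values for the helper above (assumes a ComfyUI checkout is importable so it can be pulled from comfy/weight_adapter/base.py; the expected outputs were traced from the code, not taken from the docstring table):

```
from comfy.weight_adapter.base import factorization

print(factorization(128))      # (8, 16)  - default factor=-1 searches for the most balanced divisor pair
print(factorization(128, 8))   # (8, 16)  - 8 divides 128 and 8*8 <= 128, so the factor is used directly
print(factorization(250, 4))   # (2, 125) - search stops once the candidate factor exceeds 4
print(factorization(1024))     # (32, 32) - perfect square, fully balanced split
```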

View File

@@ -3,7 +3,120 @@ from typing import Optional
import torch
import comfy.model_management

-from .base import WeightAdapterBase, weight_decompose
+from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose
class HadaWeight(torch.autograd.Function):
@staticmethod
def forward(ctx, w1u, w1d, w2u, w2d, scale=torch.tensor(1)):
ctx.save_for_backward(w1d, w1u, w2d, w2u, scale)
diff_weight = ((w1u @ w1d) * (w2u @ w2d)) * scale
return diff_weight
@staticmethod
def backward(ctx, grad_out):
(w1d, w1u, w2d, w2u, scale) = ctx.saved_tensors
grad_out = grad_out * scale
temp = grad_out * (w2u @ w2d)
grad_w1u = temp @ w1d.T
grad_w1d = w1u.T @ temp
temp = grad_out * (w1u @ w1d)
grad_w2u = temp @ w2d.T
grad_w2d = w2u.T @ temp
del temp
return grad_w1u, grad_w1d, grad_w2u, grad_w2d, None
class HadaWeightTucker(torch.autograd.Function):
@staticmethod
def forward(ctx, t1, w1u, w1d, t2, w2u, w2d, scale=torch.tensor(1)):
ctx.save_for_backward(t1, w1d, w1u, t2, w2d, w2u, scale)
rebuild1 = torch.einsum("i j ..., j r, i p -> p r ...", t1, w1d, w1u)
rebuild2 = torch.einsum("i j ..., j r, i p -> p r ...", t2, w2d, w2u)
return rebuild1 * rebuild2 * scale
@staticmethod
def backward(ctx, grad_out):
(t1, w1d, w1u, t2, w2d, w2u, scale) = ctx.saved_tensors
grad_out = grad_out * scale
temp = torch.einsum("i j ..., j r -> i r ...", t2, w2d)
rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w2u)
grad_w = rebuild * grad_out
del rebuild
grad_w1u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w)
grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w1u.T)
del grad_w, temp
grad_w1d = torch.einsum("i r ..., i j ... -> r j", t1, grad_temp)
grad_t1 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w1d.T)
del grad_temp
temp = torch.einsum("i j ..., j r -> i r ...", t1, w1d)
rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w1u)
grad_w = rebuild * grad_out
del rebuild
grad_w2u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w)
grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w2u.T)
del grad_w, temp
grad_w2d = torch.einsum("i r ..., i j ... -> r j", t2, grad_temp)
grad_t2 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w2d.T)
del grad_temp
return grad_t1, grad_w1u, grad_w1d, grad_t2, grad_w2u, grad_w2d, None
class LohaDiff(WeightAdapterTrainBase):
def __init__(self, weights):
super().__init__()
# Unpack weights tuple from LoHaAdapter
w1a, w1b, alpha, w2a, w2b, t1, t2, _ = weights
# Create trainable parameters
self.hada_w1_a = torch.nn.Parameter(w1a)
self.hada_w1_b = torch.nn.Parameter(w1b)
self.hada_w2_a = torch.nn.Parameter(w2a)
self.hada_w2_b = torch.nn.Parameter(w2b)
self.use_tucker = False
if t1 is not None and t2 is not None:
self.use_tucker = True
self.hada_t1 = torch.nn.Parameter(t1)
self.hada_t2 = torch.nn.Parameter(t2)
else:
# Keep the attributes for consistent access
self.hada_t1 = None
self.hada_t2 = None
# Store rank and non-trainable alpha
self.rank = w1b.shape[0]
self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False)
def __call__(self, w):
org_dtype = w.dtype
scale = self.alpha / self.rank
if self.use_tucker:
diff_weight = HadaWeightTucker.apply(self.hada_t1, self.hada_w1_a, self.hada_w1_b, self.hada_t2, self.hada_w2_a, self.hada_w2_b, scale)
else:
diff_weight = HadaWeight.apply(self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale)
# Add the scaled difference to the original weight
weight = w.to(diff_weight) + diff_weight.reshape(w.shape)
return weight.to(org_dtype)
def passive_memory_usage(self):
"""Calculates memory usage of the trainable parameters."""
return sum(param.numel() * param.element_size() for param in self.parameters())
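The LoHa delta is the elementwise (Hadamard) product of two rank-r factorizations, which is exactly what HadaWeight.forward computes. A small shape sketch under assumed dimensions (a (64, 32) weight with rank 4, scaled by alpha/rank as in LohaDiff.__call__):

```
import torch

out_dim, in_dim, rank, alpha = 64, 32, 4, 4.0
w1a, w1b = torch.randn(out_dim, rank), torch.randn(rank, in_dim)
w2a, w2b = torch.randn(out_dim, rank), torch.randn(rank, in_dim)
delta = (w1a @ w1b) * (w2a @ w2b) * (alpha / rank)  # same form as HadaWeight.forward
print(delta.shape)   # torch.Size([64, 32])
```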
class LoHaAdapter(WeightAdapterBase):
@@ -13,6 +126,25 @@ class LoHaAdapter(WeightAdapterBase):
        self.loaded_keys = loaded_keys
        self.weights = weights
@classmethod
def create_train(cls, weight, rank=1, alpha=1.0):
out_dim = weight.shape[0]
in_dim = weight.shape[1:].numel()
mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype)
mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype)
torch.nn.init.normal_(mat1, 0.1)
torch.nn.init.constant_(mat2, 0.0)
mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype)
mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype)
torch.nn.init.normal_(mat3, 0.1)
torch.nn.init.normal_(mat4, 0.01)
return LohaDiff(
(mat1, mat2, alpha, mat3, mat4, None, None, None)
)
def to_train(self):
return LohaDiff(self.weights)
@classmethod @classmethod
def load( def load(
cls, cls,

View File

@@ -3,7 +3,77 @@ from typing import Optional
import torch
import comfy.model_management

-from .base import WeightAdapterBase, weight_decompose
+from .base import (
+    WeightAdapterBase,
+    WeightAdapterTrainBase,
+    weight_decompose,
+    factorization,
+)
class LokrDiff(WeightAdapterTrainBase):
def __init__(self, weights):
super().__init__()
(lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) = weights
self.use_tucker = False
if lokr_w1_a is not None:
_, rank_a = lokr_w1_a.shape[0], lokr_w1_a.shape[1]
rank_a, _ = lokr_w1_b.shape[0], lokr_w1_b.shape[1]
self.lokr_w1_a = torch.nn.Parameter(lokr_w1_a)
self.lokr_w1_b = torch.nn.Parameter(lokr_w1_b)
self.w1_rebuild = True
self.ranka = rank_a
if lokr_w2_a is not None:
_, rank_b = lokr_w2_a.shape[0], lokr_w2_a.shape[1]
rank_b, _ = lokr_w2_b.shape[0], lokr_w2_b.shape[1]
self.lokr_w2_a = torch.nn.Parameter(lokr_w2_a)
self.lokr_w2_b = torch.nn.Parameter(lokr_w2_b)
if lokr_t2 is not None:
self.use_tucker = True
self.lokr_t2 = torch.nn.Parameter(lokr_t2)
self.w2_rebuild = True
self.rankb = rank_b
if lokr_w1 is not None:
self.lokr_w1 = torch.nn.Parameter(lokr_w1)
self.w1_rebuild = False
if lokr_w2 is not None:
self.lokr_w2 = torch.nn.Parameter(lokr_w2)
self.w2_rebuild = False
self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False)
@property
def w1(self):
if self.w1_rebuild:
return (self.lokr_w1_a @ self.lokr_w1_b) * (self.alpha / self.ranka)
else:
return self.lokr_w1
@property
def w2(self):
if self.w2_rebuild:
if self.use_tucker:
w2 = torch.einsum(
'i j k l, j r, i p -> p r k l',
self.lokr_t2,
self.lokr_w2_b,
self.lokr_w2_a
)
else:
w2 = self.lokr_w2_a @ self.lokr_w2_b
return w2 * (self.alpha / self.rankb)
else:
return self.lokr_w2
def __call__(self, w):
diff = torch.kron(self.w1, self.w2)
return w + diff.reshape(w.shape).to(w)
def passive_memory_usage(self):
return sum(param.numel() * param.element_size() for param in self.parameters())
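The Kronecker factorization means the full delta never needs a single low-rank pair of its own size: kron(w1, w2) has shape (out1*out2, in1*in2). A standalone shape check with values mirroring create_train for a (128, 64) weight and rank 8, where factorization(128, 8) gives (8, 16) and factorization(64, 8) gives (8, 8):

```
import torch

w1 = torch.zeros(8, 8)    # (out1, in1)
w2 = torch.zeros(16, 8)   # (out2, in2)
delta = torch.kron(w1, w2)
print(delta.shape)        # torch.Size([128, 64]) -- matches the adapted weight's shape
```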
class LoKrAdapter(WeightAdapterBase):
@@ -13,6 +83,20 @@ class LoKrAdapter(WeightAdapterBase):
        self.loaded_keys = loaded_keys
        self.weights = weights
@classmethod
def create_train(cls, weight, rank=1, alpha=1.0):
out_dim = weight.shape[0]
in_dim = weight.shape[1:].numel()
out1, out2 = factorization(out_dim, rank)
in1, in2 = factorization(in_dim, rank)
mat1 = torch.empty(out1, in1, device=weight.device, dtype=weight.dtype)
mat2 = torch.empty(out2, in2, device=weight.device, dtype=weight.dtype)
torch.nn.init.kaiming_uniform_(mat2, a=5**0.5)
torch.nn.init.constant_(mat1, 0.0)
return LokrDiff(
(mat1, mat2, alpha, None, None, None, None, None, None)
)
@classmethod @classmethod
def load( def load(
cls, cls,

View File

@@ -3,7 +3,58 @@ from typing import Optional
import torch
import comfy.model_management

-from .base import WeightAdapterBase, weight_decompose
+from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose, factorization
class OFTDiff(WeightAdapterTrainBase):
def __init__(self, weights):
super().__init__()
# Unpack weights tuple from LoHaAdapter
blocks, rescale, alpha, _ = weights
# Create trainable parameters
self.oft_blocks = torch.nn.Parameter(blocks)
if rescale is not None:
self.rescale = torch.nn.Parameter(rescale)
self.rescaled = True
else:
self.rescaled = False
self.block_num, self.block_size, _ = blocks.shape
self.constraint = float(alpha)
self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False)
def __call__(self, w):
org_dtype = w.dtype
I = torch.eye(self.block_size, device=self.oft_blocks.device)
## generate r
# for Q = -Q^T
q = self.oft_blocks - self.oft_blocks.transpose(1, 2)
normed_q = q
if self.constraint:
q_norm = torch.norm(q) + 1e-8
if q_norm > self.constraint:
normed_q = q * self.constraint / q_norm
# use float() to prevent unsupported type
r = (I + normed_q) @ (I - normed_q).float().inverse()
## Apply chunked matmul on weight
_, *shape = w.shape
org_weight = w.to(dtype=r.dtype)
org_weight = org_weight.unflatten(0, (self.block_num, self.block_size))
# Init R=0, so add I on it to ensure the output of step0 is original model output
weight = torch.einsum(
"k n m, k n ... -> k m ...",
r,
org_weight,
).flatten(0, 1)
if self.rescaled:
weight = self.rescale * weight
return weight.to(org_dtype)
def passive_memory_usage(self):
"""Calculates memory usage of the trainable parameters."""
return sum(param.numel() * param.element_size() for param in self.parameters())
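OFTDiff builds its rotation from a skew-symmetric Q via the Cayley transform R = (I + Q)(I - Q)^-1, so R is orthogonal and zero-initialized blocks give R = I (the adapted weight starts equal to the original). A small standalone check of that property (torch.linalg.inv stands in for the .float().inverse() call above):

```
import torch

block_size = 4
blocks = torch.randn(block_size, block_size)
q = blocks - blocks.T                       # skew-symmetric, as in __call__
I = torch.eye(block_size)
r = (I + q) @ torch.linalg.inv(I - q)       # Cayley transform
print(torch.allclose(r @ r.T, I, atol=1e-4))                      # True: R is (numerically) orthogonal
print(torch.allclose((I + 0 * q) @ torch.linalg.inv(I - 0 * q), I))  # True: zero blocks -> identity
```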
class OFTAdapter(WeightAdapterBase):
@@ -13,6 +64,18 @@ class OFTAdapter(WeightAdapterBase):
        self.loaded_keys = loaded_keys
        self.weights = weights
@classmethod
def create_train(cls, weight, rank=1, alpha=1.0):
out_dim = weight.shape[0]
block_size, block_num = factorization(out_dim, rank)
block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=weight.dtype)
return OFTDiff(
(block, None, alpha, None)
)
def to_train(self):
return OFTDiff(self.weights)
@classmethod @classmethod
def load( def load(
cls, cls,
@@ -60,6 +123,8 @@ class OFTAdapter(WeightAdapterBase):
        blocks = v[0]
        rescale = v[1]
        alpha = v[2]
+       if alpha is None:
+           alpha = 0
        dora_scale = v[3]

        blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype)

View File

@@ -2,7 +2,7 @@
## Introduction

-Below are a collection of nodes that work by calling external APIs. More information available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview#api-nodes).
+Below are a collection of nodes that work by calling external APIs. More information available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview).

## Development

View File

@@ -2,7 +2,11 @@ import logging
from typing import Any, Callable, Optional, TypeVar
import random
import torch
-from comfy_api_nodes.util.validation_utils import get_image_dimensions, validate_image_dimensions, validate_video_dimensions
+from comfy_api_nodes.util.validation_utils import (
+    get_image_dimensions,
+    validate_image_dimensions,
+    validate_video_dimensions,
+)

from comfy_api_nodes.apis import (
@@ -10,7 +14,7 @@ from comfy_api_nodes.apis import (
    MoonvalleyTextToVideoInferenceParams,
    MoonvalleyVideoToVideoInferenceParams,
    MoonvalleyVideoToVideoRequest,
-   MoonvalleyPromptResponse
+   MoonvalleyPromptResponse,
)
from comfy_api_nodes.apis.client import (
    ApiEndpoint,
@ -54,20 +58,26 @@ MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing
MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000 MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000
R = TypeVar("R") R = TypeVar("R")
class MoonvalleyApiError(Exception): class MoonvalleyApiError(Exception):
"""Base exception for Moonvalley API errors.""" """Base exception for Moonvalley API errors."""
pass pass
def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool: def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool:
"""Verifies that the initial response contains a task ID.""" """Verifies that the initial response contains a task ID."""
return bool(response.id) return bool(response.id)
def validate_task_creation_response(response) -> None: def validate_task_creation_response(response) -> None:
if not is_valid_task_creation_response(response): if not is_valid_task_creation_response(response):
error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}" error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}"
logging.error(error_msg) logging.error(error_msg)
raise MoonvalleyApiError(error_msg) raise MoonvalleyApiError(error_msg)
def get_video_from_response(response): def get_video_from_response(response):
video = response.output_url video = response.output_url
logging.info( logging.info(
@ -102,16 +112,17 @@ def poll_until_finished(
poll_interval=16.0, poll_interval=16.0,
failed_statuses=["error"], failed_statuses=["error"],
status_extractor=lambda response: ( status_extractor=lambda response: (
response.status response.status if response and response.status else None
if response and response.status
else None
), ),
auth_kwargs=auth_kwargs, auth_kwargs=auth_kwargs,
result_url_extractor=result_url_extractor, result_url_extractor=result_url_extractor,
node_id=node_id, node_id=node_id,
).execute() ).execute()
def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH):
def validate_prompts(
prompt: str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH
):
"""Verifies that the prompt isn't empty and that neither prompt is too long.""" """Verifies that the prompt isn't empty and that neither prompt is too long."""
if not prompt: if not prompt:
raise ValueError("Positive prompt is empty") raise ValueError("Positive prompt is empty")
@ -123,16 +134,15 @@ def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAR
) )
return True return True
def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None): def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None):
# inference validation # inference validation
# T = num_frames # T = num_frames
# in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition...
# with image conditioning: H*W must be divisible by 8192 # with image conditioning: H*W must be divisible by 8192
# without image conditioning: T divisible by 32 # without image conditioning: T divisible by 32
if num_frames_in and not num_frames_in % 16 == 0 : if num_frames_in and not num_frames_in % 16 == 0:
return False, ( return False, ("The input video total frame count must be divisible by 16!")
"The input video total frame count must be divisible by 16!"
)
if height % 8 != 0 or width % 8 != 0: if height % 8 != 0 or width % 8 != 0:
return False, ( return False, (
@ -146,13 +156,13 @@ def validate_input_media(width, height, with_frame_conditioning, num_frames_in=N
"divisible by 8192 for frame conditioning" "divisible by 8192 for frame conditioning"
) )
else: else:
if num_frames_in and not num_frames_in % 32 == 0 : if num_frames_in and not num_frames_in % 32 == 0:
return False, ( return False, ("The input video total frame count must be divisible by 32!")
"The input video total frame count must be divisible by 32!"
)
def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=False) -> None: def validate_input_image(
image: torch.Tensor, with_frame_conditioning: bool = False
) -> None:
""" """
Validates the input image adheres to the expectations of the API: Validates the input image adheres to the expectations of the API:
- The image resolution should not be less than 300*300px - The image resolution should not be less than 300*300px
@ -160,10 +170,15 @@ def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=Fals
""" """
height, width = get_image_dimensions(image) height, width = get_image_dimensions(image)
validate_input_media(width, height, with_frame_conditioning ) validate_input_media(width, height, with_frame_conditioning)
validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) validate_image_dimensions(
image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH
)
def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_conditioning: bool=False):
def validate_input_video(
video: VideoInput, num_frames_out: int, with_frame_conditioning: bool = False
):
try: try:
width, height = video.get_dimensions() width, height = video.get_dimensions()
except Exception as e: except Exception as e:
@ -171,7 +186,13 @@ def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_cond
raise ValueError(f"Cannot get video dimensions: {e}") from e raise ValueError(f"Cannot get video dimensions: {e}") from e
validate_input_media(width, height, with_frame_conditioning) validate_input_media(width, height, with_frame_conditioning)
validate_video_dimensions(video, min_width=MIN_VID_WIDTH, min_height=MIN_VID_HEIGHT, max_width=MAX_VID_WIDTH, max_height=MAX_VID_HEIGHT) validate_video_dimensions(
video,
min_width=MIN_VID_WIDTH,
min_height=MIN_VID_HEIGHT,
max_width=MAX_VID_WIDTH,
max_height=MAX_VID_HEIGHT,
)
trimmed_video = validate_input_video_length(video, num_frames_out) trimmed_video = validate_input_video_length(video, num_frames_out)
return trimmed_video return trimmed_video
@ -180,22 +201,29 @@ def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_cond
def validate_input_video_length(video: VideoInput, num_frames: int): def validate_input_video_length(video: VideoInput, num_frames: int):
if video.get_duration() > 60: if video.get_duration() > 60:
raise MoonvalleyApiError("Input Video lenth should be less than 1min. Please trim.") raise MoonvalleyApiError(
"Input Video lenth should be less than 1min. Please trim."
)
if num_frames == 128: if num_frames == 128:
if video.get_duration() < 5: if video.get_duration() < 5:
raise MoonvalleyApiError("Input Video length is less than 5s. Please use a video longer than or equal to 5s.") raise MoonvalleyApiError(
if video.get_duration() > 5: "Input Video length is less than 5s. Please use a video longer than or equal to 5s."
# trim video to 5s )
video = trim_video(video, 5) if video.get_duration() > 5:
# trim video to 5s
video = trim_video(video, 5)
if num_frames == 256: if num_frames == 256:
if video.get_duration() < 10: if video.get_duration() < 10:
raise MoonvalleyApiError("Input Video length is less than 10s. Please use a video longer than or equal to 10s.") raise MoonvalleyApiError(
"Input Video length is less than 10s. Please use a video longer than or equal to 10s."
)
if video.get_duration() > 10: if video.get_duration() > 10:
# trim video to 10s # trim video to 10s
video = trim_video(video, 10) video = trim_video(video, 10)
return video return video
def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: def trim_video(video: VideoInput, duration_sec: float) -> VideoInput:
""" """
Returns a new VideoInput object trimmed from the beginning to the specified duration, Returns a new VideoInput object trimmed from the beginning to the specified duration,
@ -219,8 +247,8 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput:
input_source = video.get_stream_source() input_source = video.get_stream_source()
# Open containers # Open containers
input_container = av.open(input_source, mode='r') input_container = av.open(input_source, mode="r")
output_container = av.open(output_buffer, mode='w', format='mp4') output_container = av.open(output_buffer, mode="w", format="mp4")
# Set up output streams for re-encoding # Set up output streams for re-encoding
video_stream = None video_stream = None
@ -230,22 +258,32 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput:
logging.info(f"Found stream: type={stream.type}, class={type(stream)}") logging.info(f"Found stream: type={stream.type}, class={type(stream)}")
if isinstance(stream, av.VideoStream): if isinstance(stream, av.VideoStream):
# Create output video stream with same parameters # Create output video stream with same parameters
video_stream = output_container.add_stream('h264', rate=stream.average_rate) video_stream = output_container.add_stream(
"h264", rate=stream.average_rate
)
video_stream.width = stream.width video_stream.width = stream.width
video_stream.height = stream.height video_stream.height = stream.height
video_stream.pix_fmt = 'yuv420p' video_stream.pix_fmt = "yuv420p"
logging.info(f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps") logging.info(
f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps"
)
elif isinstance(stream, av.AudioStream): elif isinstance(stream, av.AudioStream):
# Create output audio stream with same parameters # Create output audio stream with same parameters
audio_stream = output_container.add_stream('aac', rate=stream.sample_rate) audio_stream = output_container.add_stream(
"aac", rate=stream.sample_rate
)
audio_stream.sample_rate = stream.sample_rate audio_stream.sample_rate = stream.sample_rate
audio_stream.layout = stream.layout audio_stream.layout = stream.layout
logging.info(f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels") logging.info(
f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels"
)
        # Calculate target frame count that's divisible by 32
        fps = input_container.streams.video[0].average_rate
        estimated_frames = int(duration_sec * fps)
-       target_frames = (estimated_frames // 32) * 32  # Round down to nearest multiple of 32
+       target_frames = (
+           estimated_frames // 32
+       ) * 32  # Round down to nearest multiple of 32

        if target_frames == 0:
            raise ValueError("Video too short: need at least 32 frames for Moonvalley")
@ -268,7 +306,9 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput:
for packet in video_stream.encode(): for packet in video_stream.encode():
output_container.mux(packet) output_container.mux(packet)
logging.info(f"Encoded {frame_count} video frames (target: {target_frames})") logging.info(
f"Encoded {frame_count} video frames (target: {target_frames})"
)
# Decode and re-encode audio frames # Decode and re-encode audio frames
if audio_stream: if audio_stream:
@ -292,7 +332,6 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput:
output_container.close() output_container.close()
input_container.close() input_container.close()
# Return as VideoFromFile using the buffer # Return as VideoFromFile using the buffer
output_buffer.seek(0) output_buffer.seek(0)
return VideoFromFile(output_buffer) return VideoFromFile(output_buffer)
@ -305,6 +344,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput:
output_container.close() output_container.close()
raise RuntimeError(f"Failed to trim video: {str(e)}") from e raise RuntimeError(f"Failed to trim video: {str(e)}") from e
# --- BaseMoonvalleyVideoNode --- # --- BaseMoonvalleyVideoNode ---
class BaseMoonvalleyVideoNode: class BaseMoonvalleyVideoNode:
def parseWidthHeightFromRes(self, resolution: str): def parseWidthHeightFromRes(self, resolution: str):
@ -328,7 +368,7 @@ class BaseMoonvalleyVideoNode:
"Motion Transfer": "motion_control", "Motion Transfer": "motion_control",
"Canny": "canny_control", "Canny": "canny_control",
"Pose Transfer": "pose_control", "Pose Transfer": "pose_control",
"Depth": "depth_control" "Depth": "depth_control",
} }
if value in control_map: if value in control_map:
return control_map[value] return control_map[value]
@ -355,31 +395,63 @@ class BaseMoonvalleyVideoNode:
return { return {
"required": { "required": {
"prompt": model_field_to_node_input( "prompt": model_field_to_node_input(
IO.STRING, MoonvalleyTextToVideoRequest, "prompt_text", IO.STRING,
multiline=True MoonvalleyTextToVideoRequest,
"prompt_text",
multiline=True,
), ),
"negative_prompt": model_field_to_node_input( "negative_prompt": model_field_to_node_input(
IO.STRING, IO.STRING,
MoonvalleyTextToVideoInferenceParams, MoonvalleyTextToVideoInferenceParams,
"negative_prompt", "negative_prompt",
multiline=True, multiline=True,
default="gopro, bright, contrast, static, overexposed, bright, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, contrast, saturated, vibrant, glowing, cross dissolve, texture, videogame, saturation, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, transition, dissolve, cross-dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring, static", default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts",
), ),
"resolution": (
"resolution": (IO.COMBO, { IO.COMBO,
"options": ["16:9 (1920 x 1080)", {
"9:16 (1080 x 1920)", "options": [
"1:1 (1152 x 1152)", "16:9 (1920 x 1080)",
"4:3 (1440 x 1080)", "9:16 (1080 x 1920)",
"3:4 (1080 x 1440)", "1:1 (1152 x 1152)",
"21:9 (2560 x 1080)"], "4:3 (1440 x 1080)",
"3:4 (1080 x 1440)",
"21:9 (2560 x 1080)",
],
"default": "16:9 (1920 x 1080)", "default": "16:9 (1920 x 1080)",
"tooltip": "Resolution of the output video", "tooltip": "Resolution of the output video",
}), },
),
# "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}), # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}),
"prompt_adherence": model_field_to_node_input(IO.FLOAT,MoonvalleyTextToVideoInferenceParams,"guidance_scale",default=7.0, step=1, min=1, max=20), "prompt_adherence": model_field_to_node_input(
"seed": model_field_to_node_input(IO.INT,MoonvalleyTextToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), IO.FLOAT,
"steps": model_field_to_node_input(IO.INT, MoonvalleyTextToVideoInferenceParams, "steps", default=100, min=1, max=100), MoonvalleyTextToVideoInferenceParams,
"guidance_scale",
default=7.0,
step=1,
min=1,
max=20,
),
"seed": model_field_to_node_input(
IO.INT,
MoonvalleyTextToVideoInferenceParams,
"seed",
default=random.randint(0, 2**32 - 1),
min=0,
max=4294967295,
step=1,
display="number",
tooltip="Random seed value",
control_after_generate=True,
),
"steps": model_field_to_node_input(
IO.INT,
MoonvalleyTextToVideoInferenceParams,
"steps",
default=100,
min=1,
max=100,
),
}, },
"hidden": { "hidden": {
"auth_token": "AUTH_TOKEN_COMFY_ORG", "auth_token": "AUTH_TOKEN_COMFY_ORG",
@ -393,7 +465,7 @@ class BaseMoonvalleyVideoNode:
"image_url", "image_url",
tooltip="The reference image used to generate the video", tooltip="The reference image used to generate the video",
), ),
} },
} }
RETURN_TYPES = ("STRING",) RETURN_TYPES = ("STRING",)
@ -404,6 +476,7 @@ class BaseMoonvalleyVideoNode:
def generate(self, **kwargs): def generate(self, **kwargs):
return None return None
# --- MoonvalleyImg2VideoNode --- # --- MoonvalleyImg2VideoNode ---
class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode):
@ -415,43 +488,46 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode):
RETURN_NAMES = ("video",) RETURN_NAMES = ("video",)
DESCRIPTION = "Moonvalley Marey Image to Video Node" DESCRIPTION = "Moonvalley Marey Image to Video Node"
def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): def generate(
self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs
):
image = kwargs.get("image", None) image = kwargs.get("image", None)
if (image is None): if image is None:
raise MoonvalleyApiError("image is required") raise MoonvalleyApiError("image is required")
total_frames = get_total_frames_from_length() total_frames = get_total_frames_from_length()
validate_input_image(image,True) validate_input_image(image, True)
validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution"))
inference_params=MoonvalleyTextToVideoInferenceParams( inference_params = MoonvalleyTextToVideoInferenceParams(
negative_prompt=negative_prompt, negative_prompt=negative_prompt,
steps=kwargs.get("steps"), steps=kwargs.get("steps"),
seed=kwargs.get("seed"), seed=kwargs.get("seed"),
guidance_scale=kwargs.get("prompt_adherence"), guidance_scale=kwargs.get("prompt_adherence"),
num_frames=total_frames, num_frames=total_frames,
width=width_height.get("width"), width=width_height.get("width"),
height=width_height.get("height"), height=width_height.get("height"),
use_negative_prompts=True use_negative_prompts=True,
) )
"""Upload image to comfy backend to have a URL available for further processing""" """Upload image to comfy backend to have a URL available for further processing"""
# Get MIME type from tensor - assuming PNG format for image tensors # Get MIME type from tensor - assuming PNG format for image tensors
mime_type = "image/png" mime_type = "image/png"
image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type)[0] image_url = upload_images_to_comfyapi(
image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type
)[0]
request = MoonvalleyTextToVideoRequest( request = MoonvalleyTextToVideoRequest(
image_url=image_url, image_url=image_url, prompt_text=prompt, inference_params=inference_params
prompt_text=prompt, )
inference_params=inference_params
)
initial_operation = SynchronousOperation( initial_operation = SynchronousOperation(
endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, endpoint=ApiEndpoint(
method=HttpMethod.POST, path=API_IMG2VIDEO_ENDPOINT,
request_model=MoonvalleyTextToVideoRequest, method=HttpMethod.POST,
response_model=MoonvalleyPromptResponse request_model=MoonvalleyTextToVideoRequest,
), response_model=MoonvalleyPromptResponse,
),
request=request, request=request,
auth_kwargs=kwargs, auth_kwargs=kwargs,
) )
@ -463,7 +539,8 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode):
task_id, auth_kwargs=kwargs, node_id=unique_id
)
video = download_url_to_video_output(final_response.output_url)
-return (video, )
+return (video,)

# --- MoonvalleyVid2VidNode ---
class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode):
@ -479,38 +556,46 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode):
if param in input_types["optional"]: if param in input_types["optional"]:
del input_types["optional"][param] del input_types["optional"][param]
input_types["optional"] = { input_types["optional"] = {
"video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically."}), "video": (
"control_type": ( IO.VIDEO,
["Motion Transfer", "Pose Transfer"], {
{"default": "Motion Transfer"}, "default": "",
), "multiline": False,
"motion_intensity": ( "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically.",
"INT", },
{ ),
"default": 100, "control_type": (
"step": 1, ["Motion Transfer", "Pose Transfer"],
"min": 0, {"default": "Motion Transfer"},
"max": 100, ),
"tooltip": "Only used if control_type is 'Motion Transfer'", "motion_intensity": (
}, "INT",
) {
} "default": 100,
"step": 1,
"min": 0,
"max": 100,
"tooltip": "Only used if control_type is 'Motion Transfer'",
},
),
}
return input_types return input_types
RETURN_TYPES = ("VIDEO",) RETURN_TYPES = ("VIDEO",)
RETURN_NAMES = ("video",) RETURN_NAMES = ("video",)
def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): def generate(
self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs
):
video = kwargs.get("video") video = kwargs.get("video")
num_frames = get_total_frames_from_length() num_frames = get_total_frames_from_length()
if not video : if not video:
raise MoonvalleyApiError("video is required") raise MoonvalleyApiError("video is required")
"""Validate video input""" """Validate video input"""
video_url="" video_url = ""
if video: if video:
validated_video = validate_input_video(video, num_frames, False) validated_video = validate_input_video(video, num_frames, False)
video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs)
@ -520,29 +605,30 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode):
"""Validate prompts and inference input""" """Validate prompts and inference input"""
validate_prompts(prompt, negative_prompt) validate_prompts(prompt, negative_prompt)
inference_params=MoonvalleyVideoToVideoInferenceParams( inference_params = MoonvalleyVideoToVideoInferenceParams(
negative_prompt=negative_prompt, negative_prompt=negative_prompt,
steps=kwargs.get("steps"), steps=kwargs.get("steps"),
seed=kwargs.get("seed"), seed=kwargs.get("seed"),
guidance_scale=kwargs.get("prompt_adherence"), guidance_scale=kwargs.get("prompt_adherence"),
control_params={'motion_intensity': motion_intensity} control_params={"motion_intensity": motion_intensity},
) )
control = self.parseControlParameter(control_type) control = self.parseControlParameter(control_type)
request = MoonvalleyVideoToVideoRequest( request = MoonvalleyVideoToVideoRequest(
control_type=control, control_type=control,
video_url=video_url, video_url=video_url,
prompt_text=prompt, prompt_text=prompt,
inference_params=inference_params inference_params=inference_params,
) )
initial_operation = SynchronousOperation( initial_operation = SynchronousOperation(
endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, endpoint=ApiEndpoint(
method=HttpMethod.POST, path=API_VIDEO2VIDEO_ENDPOINT,
request_model=MoonvalleyVideoToVideoRequest, method=HttpMethod.POST,
response_model=MoonvalleyPromptResponse request_model=MoonvalleyVideoToVideoRequest,
), response_model=MoonvalleyPromptResponse,
),
request=request, request=request,
auth_kwargs=kwargs, auth_kwargs=kwargs,
) )
@ -556,7 +642,8 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode):
video = download_url_to_video_output(final_response.output_url)
-return (video, )
+return (video,)

# --- MoonvalleyTxt2VideoNode ---
class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode):
@ -575,31 +662,33 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode):
del input_types["optional"][param] del input_types["optional"][param]
return input_types return input_types
def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): def generate(
self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs
):
validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution"))
num_frames = get_total_frames_from_length() num_frames = get_total_frames_from_length()
inference_params=MoonvalleyTextToVideoInferenceParams( inference_params = MoonvalleyTextToVideoInferenceParams(
negative_prompt=negative_prompt, negative_prompt=negative_prompt,
steps=kwargs.get("steps"), steps=kwargs.get("steps"),
seed=kwargs.get("seed"), seed=kwargs.get("seed"),
guidance_scale=kwargs.get("prompt_adherence"), guidance_scale=kwargs.get("prompt_adherence"),
num_frames=num_frames, num_frames=num_frames,
width=width_height.get("width"), width=width_height.get("width"),
height=width_height.get("height"), height=width_height.get("height"),
) )
request = MoonvalleyTextToVideoRequest( request = MoonvalleyTextToVideoRequest(
prompt_text=prompt, prompt_text=prompt, inference_params=inference_params
inference_params=inference_params )
)
initial_operation = SynchronousOperation( initial_operation = SynchronousOperation(
endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, endpoint=ApiEndpoint(
method=HttpMethod.POST, path=API_TXT2VIDEO_ENDPOINT,
request_model=MoonvalleyTextToVideoRequest, method=HttpMethod.POST,
response_model=MoonvalleyPromptResponse request_model=MoonvalleyTextToVideoRequest,
), response_model=MoonvalleyPromptResponse,
),
request=request, request=request,
auth_kwargs=kwargs, auth_kwargs=kwargs,
) )
@ -612,8 +701,7 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode):
)
video = download_url_to_video_output(final_response.output_url)
-return (video, )
+return (video,)

NODE_CLASS_MAPPINGS = {
@ -629,6 +717,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
# "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video",
} }
def get_total_frames_from_length(length="5s"): def get_total_frames_from_length(length="5s"):
# if length == '5s': # if length == '5s':
# return 128 # return 128

View File

@ -20,7 +20,7 @@ import folder_paths
import node_helpers
from comfy.cli_args import args
from comfy.comfy_types.node_typing import IO
-from comfy.weight_adapter import adapters
+from comfy.weight_adapter import adapters, adapter_maps

def make_batch_extra_option_dict(d, indicies, full_size=None):
@ -39,13 +39,13 @@ def make_batch_extra_option_dict(d, indicies, full_size=None):
class TrainSampler(comfy.samplers.Sampler):
-def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, total_steps=1, seed=0, training_dtype=torch.bfloat16):
+def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16):
self.loss_fn = loss_fn
self.optimizer = optimizer
self.loss_callback = loss_callback
self.batch_size = batch_size
self.total_steps = total_steps
+self.grad_acc = grad_acc
self.seed = seed
self.training_dtype = training_dtype
@ -92,8 +92,9 @@ class TrainSampler(comfy.samplers.Sampler):
self.loss_callback(loss.item())
pbar.set_postfix({"loss": f"{loss.item():.4f}"})
-self.optimizer.step()
-self.optimizer.zero_grad()
+if (i+1) % self.grad_acc == 0:
+self.optimizer.step()
+self.optimizer.zero_grad()
torch.cuda.empty_cache()
return torch.zeros_like(latent_image)
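
For readers unfamiliar with the pattern, here is a minimal standalone sketch of what the new grad_acc counter does: the optimizer only steps (and clears gradients) once every grad_acc iterations, so gradients from consecutive mini-batches accumulate. The model, optimizer, and data below are made up purely for illustration.

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
grad_acc = 4  # accumulate gradients over 4 mini-batches before stepping

for i in range(16):
    x = torch.randn(2, 4)
    loss = model(x).pow(2).mean()
    loss.backward()  # gradients keep accumulating in .grad across iterations
    if (i + 1) % grad_acc == 0:
        optimizer.step()
        optimizer.zero_grad()
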
@ -419,6 +420,16 @@ class TrainLoraNode:
"tooltip": "The batch size to use for training.", "tooltip": "The batch size to use for training.",
}, },
), ),
"grad_accumulation_steps": (
IO.INT,
{
"default": 1,
"min": 1,
"max": 1024,
"step": 1,
"tooltip": "The number of gradient accumulation steps to use for training.",
}
),
"steps": ( "steps": (
IO.INT, IO.INT,
{ {
@ -478,6 +489,17 @@ class TrainLoraNode:
["bf16", "fp32"], ["bf16", "fp32"],
{"default": "bf16", "tooltip": "The dtype to use for lora."}, {"default": "bf16", "tooltip": "The dtype to use for lora."},
), ),
"algorithm": (
list(adapter_maps.keys()),
{"default": list(adapter_maps.keys())[0], "tooltip": "The algorithm to use for training."},
),
"gradient_checkpointing": (
IO.BOOLEAN,
{
"default": True,
"tooltip": "Use gradient checkpointing for training.",
}
),
"existing_lora": ( "existing_lora": (
folder_paths.get_filename_list("loras") + ["[None]"], folder_paths.get_filename_list("loras") + ["[None]"],
{ {
@ -501,6 +523,7 @@ class TrainLoraNode:
positive,
batch_size,
steps,
grad_accumulation_steps,
learning_rate,
rank,
optimizer,
@ -508,6 +531,8 @@ class TrainLoraNode:
seed,
training_dtype,
lora_dtype,
algorithm,
gradient_checkpointing,
existing_lora,
):
mp = model.clone()
@ -558,10 +583,8 @@ class TrainLoraNode:
if existing_adapter is not None:
break
else:
-# If no existing adapter found, use LoRA
-# We will add algo option in the future
existing_adapter = None
-adapter_cls = adapters[0]
+adapter_cls = adapter_maps[algorithm]
if existing_adapter is not None:
train_adapter = existing_adapter.to_train().to(lora_dtype)
@ -615,8 +638,9 @@ class TrainLoraNode:
criterion = torch.nn.SmoothL1Loss()
# setup models
-for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model):
-patch(m)
+if gradient_checkpointing:
+for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model):
+patch(m)
mp.model.requires_grad_(False)
comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True)
@ -629,7 +653,8 @@ class TrainLoraNode:
optimizer,
loss_callback=loss_callback,
batch_size=batch_size,
-total_steps=steps,
+grad_acc=grad_accumulation_steps,
+total_steps=steps*grad_accumulation_steps,
seed=seed,
training_dtype=dtype
)

View File

@ -1,3 +1,4 @@
import math
import nodes
import node_helpers
import torch
@ -5,7 +6,9 @@ import comfy.model_management
import comfy.utils
import comfy.latent_formats
import comfy.clip_vision
import json
import numpy as np
from typing import Tuple

class WanImageToVideo:
@classmethod
@ -383,7 +386,307 @@ class WanPhantomSubjectToVideo:
out_latent["samples"] = latent out_latent["samples"] = latent
return (positive, cond2, negative, out_latent) return (positive, cond2, negative, out_latent)
def parse_json_tracks(tracks):
"""Parse JSON track data into a standardized format"""
tracks_data = []
try:
# If tracks is a string, try to parse it as JSON
if isinstance(tracks, str):
parsed = json.loads(tracks.replace("'", '"'))
tracks_data.extend(parsed)
else:
# If tracks is a list of strings, parse each one
for track_str in tracks:
parsed = json.loads(track_str.replace("'", '"'))
tracks_data.append(parsed)
# Check if we have a single track (dict with x,y) or a list of tracks
if tracks_data and isinstance(tracks_data[0], dict) and 'x' in tracks_data[0]:
# Single track detected, wrap it in a list
tracks_data = [tracks_data]
elif tracks_data and isinstance(tracks_data[0], list) and tracks_data[0] and isinstance(tracks_data[0][0], dict) and 'x' in tracks_data[0][0]:
# Already a list of tracks, nothing to do
pass
else:
# Unexpected format
pass
except json.JSONDecodeError:
tracks_data = []
return tracks_data
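
As a quick illustration of the shapes this helper accepts (the coordinate values below are made up), both a single track and a list of tracks normalize to a list-of-tracks:

single_track = "[{'x': 100, 'y': 200}, {'x': 110, 'y': 205}]"
multi_track = "[[{'x': 100, 'y': 200}], [{'x': 50, 'y': 60}]]"

parse_json_tracks(single_track)  # -> [[{'x': 100, 'y': 200}, {'x': 110, 'y': 205}]]
parse_json_tracks(multi_track)   # -> [[{'x': 100, 'y': 200}], [{'x': 50, 'y': 60}]]
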
def process_tracks(tracks_np: np.ndarray, frame_size: Tuple[int, int], num_frames, quant_multi: int = 8, **kwargs):
# tracks: shape [t, h, w, 3] => samples align with 24 fps, model trained with 16 fps.
# frame_size: tuple (W, H)
tracks = torch.from_numpy(tracks_np).float()
if tracks.shape[1] == 121:
tracks = torch.permute(tracks, (1, 0, 2, 3))
tracks, visibles = tracks[..., :2], tracks[..., 2:3]
short_edge = min(*frame_size)
frame_center = torch.tensor([*frame_size]).type_as(tracks) / 2
tracks = tracks - frame_center
tracks = tracks / short_edge * 2
visibles = visibles * 2 - 1
trange = torch.linspace(-1, 1, tracks.shape[0]).view(-1, 1, 1, 1).expand(*visibles.shape)
out_ = torch.cat([trange, tracks, visibles], dim=-1).view(121, -1, 4)
out_0 = out_[:1]
out_l = out_[1:] # 121 => 120 | 1
a = 120 // math.gcd(120, num_frames)
b = num_frames // math.gcd(120, num_frames)
out_l = torch.repeat_interleave(out_l, b, dim=0)[1::a] # 120 => 120 * b => 120 * b / a == F
final_result = torch.cat([out_0, out_l], dim=0)
return final_result
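
The resampling arithmetic above maps the 121 stored samples (1 anchor plus 120 at 24 fps) onto the requested frame count. A worked example, assuming num_frames=80 (i.e. an 81-frame clip once the anchor sample is re-attached):

import math

num_frames = 80
g = math.gcd(120, num_frames)
a = 120 // g         # 3: stride taken through the repeated sequence
b = num_frames // g  # 2: how many times each of the 120 samples is repeated
resampled = list(range(120 * b))[1::a]  # 120 -> 240 -> 80 entries
assert len(resampled) == num_frames
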
FIXED_LENGTH = 121
def pad_pts(tr):
"""Convert list of {x,y} to (FIXED_LENGTH,1,3) array, padding/truncating."""
pts = np.array([[p['x'], p['y'], 1] for p in tr], dtype=np.float32)
n = pts.shape[0]
if n < FIXED_LENGTH:
pad = np.zeros((FIXED_LENGTH - n, 3), dtype=np.float32)
pts = np.vstack((pts, pad))
else:
pts = pts[:FIXED_LENGTH]
return pts.reshape(FIXED_LENGTH, 1, 3)
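
A small sanity check of the padding helper above (coordinates are invented): a two-point track comes back zero-padded to the fixed length.

pts = pad_pts([{"x": 10.0, "y": 20.0}, {"x": 12.0, "y": 21.0}])
assert pts.shape == (FIXED_LENGTH, 1, 3)  # (121, 1, 3)
assert pts[2:].sum() == 0  # everything after the two real points is zero padding
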
def ind_sel(target: torch.Tensor, ind: torch.Tensor, dim: int = 1):
"""Index selection utility function"""
assert (
len(ind.shape) > dim
), "Index must have the target dim, but get dim: %d, ind shape: %s" % (dim, str(ind.shape))
target = target.expand(
*tuple(
[ind.shape[k] if target.shape[k] == 1 else -1 for k in range(dim)]
+ [
-1,
]
* (len(target.shape) - dim)
)
)
ind_pad = ind
if len(target.shape) > dim + 1:
for _ in range(len(target.shape) - (dim + 1)):
ind_pad = ind_pad.unsqueeze(-1)
ind_pad = ind_pad.expand(*(-1,) * (dim + 1), *target.shape[(dim + 1) : :])
return torch.gather(target, dim=dim, index=ind_pad)
def merge_final(vert_attr: torch.Tensor, weight: torch.Tensor, vert_assign: torch.Tensor):
"""Merge vertex attributes with weights"""
target_dim = len(vert_assign.shape) - 1
if len(vert_attr.shape) == 2:
assert vert_attr.shape[0] > vert_assign.max()
new_shape = [1] * target_dim + list(vert_attr.shape)
tensor = vert_attr.reshape(new_shape)
sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim)
else:
assert vert_attr.shape[1] > vert_assign.max()
new_shape = [vert_attr.shape[0]] + [1] * (target_dim - 1) + list(vert_attr.shape[1:])
tensor = vert_attr.reshape(new_shape)
sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim)
final_attr = torch.sum(sel_attr * weight.unsqueeze(-1), dim=-2)
return final_attr
def _patch_motion_single(
tracks: torch.FloatTensor, # (B, T, N, 4)
vid: torch.FloatTensor, # (C, T, H, W)
temperature: float,
vae_divide: tuple,
topk: int,
):
"""Apply motion patching based on tracks"""
_, T, H, W = vid.shape
N = tracks.shape[2]
_, tracks_xy, visible = torch.split(
tracks, [1, 2, 1], dim=-1
) # (B, T, N, 2) | (B, T, N, 1)
tracks_n = tracks_xy / torch.tensor([W / min(H, W), H / min(H, W)], device=tracks_xy.device)
tracks_n = tracks_n.clamp(-1, 1)
visible = visible.clamp(0, 1)
xx = torch.linspace(-W / min(H, W), W / min(H, W), W)
yy = torch.linspace(-H / min(H, W), H / min(H, W), H)
grid = torch.stack(torch.meshgrid(yy, xx, indexing="ij")[::-1], dim=-1).to(
tracks_xy.device
)
tracks_pad = tracks_xy[:, 1:]
visible_pad = visible[:, 1:]
visible_align = visible_pad.view(T - 1, 4, *visible_pad.shape[2:]).sum(1)
tracks_align = (tracks_pad * visible_pad).view(T - 1, 4, *tracks_pad.shape[2:]).sum(
1
) / (visible_align + 1e-5)
dist_ = (
(tracks_align[:, None, None] - grid[None, :, :, None]).pow(2).sum(-1)
) # T, H, W, N
weight = torch.exp(-dist_ * temperature) * visible_align.clamp(0, 1).view(
T - 1, 1, 1, N
)
vert_weight, vert_index = torch.topk(
weight, k=min(topk, weight.shape[-1]), dim=-1
)
grid_mode = "bilinear"
point_feature = torch.nn.functional.grid_sample(
vid.permute(1, 0, 2, 3)[:1],
tracks_n[:, :1].type(vid.dtype),
mode=grid_mode,
padding_mode="zeros",
align_corners=False,
)
point_feature = point_feature.squeeze(0).squeeze(1).permute(1, 0) # N, C=16
out_feature = merge_final(point_feature, vert_weight, vert_index).permute(3, 0, 1, 2) # T - 1, H, W, C => C, T - 1, H, W
out_weight = vert_weight.sum(-1) # T - 1, H, W
# out feature -> already soft weighted
mix_feature = out_feature + vid[:, 1:] * (1 - out_weight.clamp(0, 1))
out_feature_full = torch.cat([vid[:, :1], mix_feature], dim=1) # C, T, H, W
out_mask_full = torch.cat([torch.ones_like(out_weight[:1]), out_weight], dim=0) # T, H, W
return out_mask_full[None].expand(vae_divide[0], -1, -1, -1), out_feature_full
def patch_motion(
tracks: torch.FloatTensor, # (B, TB, T, N, 4)
vid: torch.FloatTensor, # (C, T, H, W)
temperature: float = 220.0,
vae_divide: tuple = (4, 16),
topk: int = 2,
):
B = len(tracks)
# Process each batch separately
out_masks = []
out_features = []
for b in range(B):
mask, feature = _patch_motion_single(
tracks[b], # (T, N, 4)
vid[b], # (C, T, H, W)
temperature,
vae_divide,
topk
)
out_masks.append(mask)
out_features.append(feature)
# Stack results: (B, C, T, H, W)
out_mask_full = torch.stack(out_masks, dim=0)
out_feature_full = torch.stack(out_features, dim=0)
return out_mask_full, out_feature_full
class WanTrackToVideo:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"vae": ("VAE", ),
"tracks": ("STRING", {"multiline": True, "default": "[]"}),
"width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
"length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
"temperature": ("FLOAT", {"default": 220.0, "min": 1.0, "max": 1000.0, "step": 0.1}),
"topk": ("INT", {"default": 2, "min": 1, "max": 10}),
"start_image": ("IMAGE", ),
},
"optional": {
"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
}}
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
RETURN_NAMES = ("positive", "negative", "latent")
FUNCTION = "encode"
CATEGORY = "conditioning/video_models"
def encode(self, positive, negative, vae, tracks, width, height, length, batch_size,
temperature, topk, start_image=None, clip_vision_output=None):
tracks_data = parse_json_tracks(tracks)
if not tracks_data:
return WanImageToVideo().encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output)
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8],
device=comfy.model_management.intermediate_device())
if isinstance(tracks_data[0][0], dict):
tracks_data = [tracks_data]
processed_tracks = []
for batch in tracks_data:
arrs = []
for track in batch:
pts = pad_pts(track)
arrs.append(pts)
tracks_np = np.stack(arrs, axis=0)
processed_tracks.append(process_tracks(tracks_np, (width, height), length - 1).unsqueeze(0))
if start_image is not None:
start_image = comfy.utils.common_upscale(start_image[:batch_size].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
videos = torch.ones((start_image.shape[0], length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5
for i in range(start_image.shape[0]):
videos[i, 0] = start_image[i]
latent_videos = []
videos = comfy.utils.resize_to_batch_size(videos, batch_size)
for i in range(batch_size):
latent_videos += [vae.encode(videos[i, :, :, :, :3])]
y = torch.cat(latent_videos, dim=0)
# Scale latent since patch_motion is non-linear
y = comfy.latent_formats.Wan21().process_in(y)
processed_tracks = comfy.utils.resize_list_to_batch_size(processed_tracks, batch_size)
res = patch_motion(
processed_tracks, y, temperature=temperature, topk=topk, vae_divide=(4, 16)
)
mask, concat_latent_image = res
concat_latent_image = comfy.latent_formats.Wan21().process_out(concat_latent_image)
mask = -mask + 1.0 # Invert mask to match expected format
positive = node_helpers.conditioning_set_values(positive,
{"concat_mask": mask,
"concat_latent_image": concat_latent_image})
negative = node_helpers.conditioning_set_values(negative,
{"concat_mask": mask,
"concat_latent_image": concat_latent_image})
if clip_vision_output is not None:
positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
out_latent = {}
out_latent["samples"] = latent
return (positive, negative, out_latent)
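
With the node's default settings (length=81, 480x832, batch_size=1), the empty latent allocated in encode() works out to the shape below; a small check under those assumptions:

import torch

batch_size, length, height, width = 1, 81, 480, 832
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8])
assert latent.shape == (1, 16, 21, 60, 104)
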
NODE_CLASS_MAPPINGS = {
"WanTrackToVideo": WanTrackToVideo,
"WanImageToVideo": WanImageToVideo, "WanImageToVideo": WanImageToVideo,
"WanFunControlToVideo": WanFunControlToVideo, "WanFunControlToVideo": WanFunControlToVideo,
"WanFunInpaintToVideo": WanFunInpaintToVideo, "WanFunInpaintToVideo": WanFunInpaintToVideo,

View File

@ -74,7 +74,8 @@ if not args.cuda_malloc:
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
version = module.__version__
-if int(version[0]) >= 2: #enable by default for torch version 2.0 and up
+if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch
args.cuda_malloc = cuda_malloc_supported()
except:
pass
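
A standalone sketch of what the tightened condition accepts, using illustrative torch version strings: only CUDA builds of torch 2.x now turn cuda-malloc on by default.

def cuda_malloc_default(version: str) -> bool:
    # mirrors the new check: major version >= 2 and a CUDA build tag in the version string
    return int(version[0]) >= 2 and "+cu" in version

assert cuda_malloc_default("2.1.2+cu121")
assert not cuda_malloc_default("2.1.2+rocm6.0")
assert not cuda_malloc_default("1.13.1+cu117")
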

View File

@ -1097,7 +1097,7 @@ class PromptQueue:
return True
return False
-def get_history(self, prompt_id=None, max_items=None, offset=-1):
+def get_history(self, prompt_id=None, max_items=None, offset=-1, map_function=None):
with self.mutex:
if prompt_id is None:
out = {}
@ -1106,13 +1106,21 @@ class PromptQueue:
offset = len(self.history) - max_items
for k in self.history:
if i >= offset:
-out[k] = self.history[k]
+p = self.history[k]
+if map_function is not None:
+p = map_function(p)
+out[k] = p
if max_items is not None and len(out) >= max_items:
break
i += 1
return out
elif prompt_id in self.history:
-return {prompt_id: copy.deepcopy(self.history[prompt_id])}
+p = self.history[prompt_id]
+if map_function is None:
+p = copy.deepcopy(p)
+else:
+p = map_function(p)
+return {prompt_id: p}
else:
return {}
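
One plausible use of the new map_function hook (the field name and queue variable below are illustrative, not taken from this commit) is to slim down history entries before they are serialized for an API response:

def strip_outputs(entry):
    slim = dict(entry)          # shallow copy so the stored history entry is not mutated
    slim.pop("outputs", None)   # illustrative field name
    return slim

# history = prompt_queue.get_history(max_items=50, map_function=strip_outputs)
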

View File

@ -115,6 +115,15 @@ if os.name == "nt":
logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage())
if __name__ == "__main__": if __name__ == "__main__":
if args.default_device is not None:
default_dev = args.default_device
devices = list(range(32))
devices.remove(default_dev)
devices.insert(0, default_dev)
devices = ','.join(map(str, devices))
os.environ['CUDA_VISIBLE_DEVICES'] = str(devices)
os.environ['HIP_VISIBLE_DEVICES'] = str(devices)
if args.cuda_device is not None:
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device)
os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device)
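
For illustration, with --default-device 2 the new block above pins device 2 first and keeps the remaining indices in their original order (the cap of 32 device slots comes straight from the code):

default_dev = 2
devices = list(range(32))
devices.remove(default_dev)
devices.insert(0, default_dev)
print(','.join(map(str, devices)))  # 2,0,1,3,4,...,31
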

View File

@ -1,5 +1,5 @@
comfyui-frontend-package==1.23.4
-comfyui-workflow-templates==0.1.39
+comfyui-workflow-templates==0.1.40
comfyui-embedded-docs==0.2.4
torch
torchsde

View File

@ -553,6 +553,7 @@ class PromptServer():
ram_free = comfy.model_management.get_free_memory(cpu_device)
vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True)
vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True)
required_frontend_version = FrontendManager.get_required_frontend_version()
system_stats = {
"system": {
@ -560,6 +561,7 @@ class PromptServer():
"ram_total": ram_total, "ram_total": ram_total,
"ram_free": ram_free, "ram_free": ram_free,
"comfyui_version": __version__, "comfyui_version": __version__,
"required_frontend_version": required_frontend_version,
"python_version": sys.version, "python_version": sys.version,
"pytorch_version": comfy.model_management.torch_version, "pytorch_version": comfy.model_management.torch_version,
"embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded",

View File

@ -1,7 +1,7 @@
import argparse
import pytest
from requests.exceptions import HTTPError
-from unittest.mock import patch
+from unittest.mock import patch, mock_open
from app.frontend_management import (
FrontendManager,
@ -172,3 +172,36 @@ def test_init_frontend_fallback_on_error():
# Assert
assert frontend_path == "/default/path"
mock_check.assert_called_once()
def test_get_frontend_version():
# Arrange
expected_version = "1.25.0"
mock_requirements_content = """torch
torchsde
comfyui-frontend-package==1.25.0
other-package==1.0.0
numpy"""
# Act
with patch("builtins.open", mock_open(read_data=mock_requirements_content)):
version = FrontendManager.get_required_frontend_version()
# Assert
assert version == expected_version
def test_get_frontend_version_invalid_semver():
# Arrange
mock_requirements_content = """torch
torchsde
comfyui-frontend-package==1.29.3.75
other-package==1.0.0
numpy"""
# Act
with patch("builtins.open", mock_open(read_data=mock_requirements_content)):
version = FrontendManager.get_required_frontend_version()
# Assert
assert version is None