Support the WAN 2.1 fun control models.

To use these models, use the new WanFunControlToVideo node.
2025-09-12 04:27:21 +00:00 · 2025-03-26 19:54:54 -04:00
parent 84fdaf7b0e
commit 3661c833bc
3 changed files with 75 additions and 7 deletions
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -992,7 +992,8 @@ class WAN21(BaseModel):

    def concat_cond(self, **kwargs):
        noise = kwargs.get("noise", None)
-        if self.diffusion_model.patch_embedding.weight.shape[1] == noise.shape[1]:
+        extra_channels = self.diffusion_model.patch_embedding.weight.shape[1] - noise.shape[1]
+        if extra_channels == 0:
            return None

        image = kwargs.get("concat_latent_image", None)
@@ -1000,12 +1001,16 @@ class WAN21(BaseModel):

        if image is None:
            image = torch.zeros_like(noise)
+            shape_image = list(noise.shape)
+            shape_image[1] = extra_channels
+            image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
+        else:
+            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+            for i in range(0, image.shape[1], 16):
+                image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16])
+            image = utils.resize_to_batch_size(image, noise.shape[0])

-        image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
-        image = self.process_latent_in(image)
-        image = utils.resize_to_batch_size(image, noise.shape[0])
-
-        if not self.image_to_video:
+        if not self.image_to_video or extra_channels == image.shape[1]:
            return image

        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))