Wan2.2 fun control support. (#9292)

This commit adds reference-latent ("fun control") support to the Wan model code: a new `ref_conv` patch embedding whose output tokens are prepended to the video tokens, an extra temporal position for those tokens, a configurable `concat_mask_index` for the `WAN21` concat conditioning, and state-dict detection of the new layer.
```diff
@@ -391,6 +391,7 @@ class WanModel(torch.nn.Module):
                  cross_attn_norm=True,
                  eps=1e-6,
                  flf_pos_embed_token_number=None,
+                 in_dim_ref_conv=None,
                  image_model=None,
                  device=None,
                  dtype=None,
```
```diff
@@ -484,6 +485,11 @@ class WanModel(torch.nn.Module):
         else:
             self.img_emb = None
 
+        if in_dim_ref_conv is not None:
+            self.ref_conv = operations.Conv2d(in_dim_ref_conv, dim, kernel_size=patch_size[1:], stride=patch_size[1:], device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        else:
+            self.ref_conv = None
+
     def forward_orig(
         self,
         x,
```
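The two hunks above add an `in_dim_ref_conv` argument to `WanModel` and, when it is set, create a 2D `ref_conv` that patch-embeds a reference latent with the model's spatial patch size. Below is a minimal sketch of what that convolution does to one reference frame, using plain `torch.nn.Conv2d`; the channel counts and spatial sizes are illustrative assumptions, not values from the commit:

```python
import torch

# Assumed, illustrative values: 16 latent channels in, 320-dim tokens out,
# 2x2 spatial patches (patch_size[1:] in the model).
in_dim_ref_conv, dim = 16, 320
ref_conv = torch.nn.Conv2d(in_dim_ref_conv, dim, kernel_size=(2, 2), stride=(2, 2))

ref = torch.randn(1, in_dim_ref_conv, 64, 64)   # one reference latent frame [B, C, H, W]
tokens = ref_conv(ref)                          # [1, 320, 32, 32]: one token per 2x2 patch
tokens = tokens.flatten(2).transpose(1, 2)      # [1, 1024, 320] = [B, L_ref, dim]
print(tokens.shape)
```

Because the kernel size equals the stride, each 2x2 patch of the reference latent maps to exactly one token, mirroring how the video latent itself is patchified.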
```diff
@@ -526,6 +532,13 @@ class WanModel(torch.nn.Module):
         e = e.reshape(t.shape[0], -1, e.shape[-1])
         e0 = self.time_projection(e).unflatten(2, (6, self.dim))
 
+        full_ref = None
+        if self.ref_conv is not None:
+            full_ref = kwargs.get("reference_latent", None)
+            if full_ref is not None:
+                full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
+                x = torch.concat((full_ref, x), dim=1)
+
         # context
         context = self.text_embedding(context)
 
```
```diff
@@ -552,6 +565,9 @@ class WanModel(torch.nn.Module):
         # head
         x = self.head(x, e)
 
+        if full_ref is not None:
+            x = x[:, full_ref.shape[1]:]
+
         # unpatchify
         x = self.unpatchify(x, grid_sizes)
         return x
```
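Together, these two `forward_orig` hunks implement a prepend-then-strip pattern: reference tokens are concatenated in front of the video tokens so the transformer blocks can attend to them, then sliced off again before unpatchifying. A small sketch of just that pattern; the names `full_ref` and `x` follow the diff, while the sizes are invented:

```python
import torch

B, L_ref, L_vid, dim = 1, 16, 128, 64
full_ref = torch.randn(B, L_ref, dim)   # tokens produced by ref_conv
x = torch.randn(B, L_vid, dim)          # patchified video tokens

x = torch.cat((full_ref, x), dim=1)     # [B, L_ref + L_vid, dim]: what the blocks see
# ... transformer blocks run over reference + video tokens here ...
x = x[:, full_ref.shape[1]:]            # drop the reference tokens from the output
assert x.shape == (B, L_vid, dim)       # only video tokens get unpatchified
```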
```diff
@@ -570,6 +586,9 @@ class WanModel(torch.nn.Module):
             x = torch.cat([x, time_dim_concat], dim=2)
             t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0])
 
+        if self.ref_conv is not None and "reference_latent" in kwargs:
+            t_len += 1
+
         img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
         img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
         img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
```
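This hunk sits where the model builds its RoPE position ids: bumping `t_len` by one when a reference latent is present appears to reserve an extra leading temporal slot, so the prepended reference tokens get positions distinct from frame 0 of the video. A sketch of the id grid with small, made-up dimensions:

```python
import torch

t_len, h_len, w_len = 4, 2, 2           # illustrative token-grid sizes
t_len += 1                              # extra temporal slot for the reference tokens

img_ids = torch.zeros((t_len, h_len, w_len, 3))
img_ids[:, :, :, 0] += torch.linspace(0, t_len - 1, steps=t_len).reshape(-1, 1, 1)
img_ids[:, :, :, 1] += torch.linspace(0, h_len - 1, steps=h_len).reshape(1, -1, 1)
img_ids[:, :, :, 2] += torch.linspace(0, w_len - 1, steps=w_len).reshape(1, 1, -1)

# The first h_len * w_len ids all carry temporal index 0 -- the slot the
# reference tokens occupy once the grid is flattened.
print(img_ids.reshape(-1, 3)[:h_len * w_len])
```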
```diff
@@ -1124,7 +1124,11 @@ class WAN21(BaseModel):
         mask = mask.repeat(1, 4, 1, 1, 1)
         mask = utils.resize_to_batch_size(mask, noise.shape[0])
 
-        return torch.cat((mask, image), dim=1)
+        concat_mask_index = kwargs.get("concat_mask_index", 0)
+        if concat_mask_index != 0:
+            return torch.cat((image[:, :concat_mask_index], mask, image[:, concat_mask_index:]), dim=1)
+        else:
+            return torch.cat((mask, image), dim=1)
 
     def extra_conds(self, **kwargs):
         out = super().extra_conds(**kwargs)
```
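In `WAN21`, the mask no longer has to sit in front of the image latent: `concat_mask_index` lets callers splice the 4 mask channels into the latent at an arbitrary channel offset, with the old mask-first behavior kept as the default. Illustrative shapes (the channel counts below are invented):

```python
import torch

mask = torch.randn(1, 4, 8, 16, 16)      # [B, 4, T, H, W], after the repeat above
image = torch.randn(1, 32, 8, 16, 16)    # [B, C, T, H, W] image latent

concat_mask_index = 16                   # splice the mask in after channel 15
out = torch.cat((image[:, :concat_mask_index], mask, image[:, concat_mask_index:]), dim=1)
print(out.shape)                         # [1, 36, 8, 16, 16]; mask occupies channels 16-19
```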
```diff
@@ -1140,6 +1144,10 @@ class WAN21(BaseModel):
         if time_dim_concat is not None:
             out['time_dim_concat'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_concat))
 
+        reference_latents = kwargs.get("reference_latents", None)
+        if reference_latents is not None:
+            out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])[:, :, 0])
+
         return out
```
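`extra_conds` is what feeds the model's `reference_latent` kwarg: it takes the most recent entry in `reference_latents` and keeps only its first frame, collapsing the usual 5D video-latent layout to the 4D layout the 2D `ref_conv` expects. The indexing in isolation, with invented sizes:

```python
import torch

# A list of video latents in [B, C, T, H, W] layout (sizes are illustrative).
reference_latents = [torch.randn(1, 16, 5, 32, 32)]

ref = reference_latents[-1][:, :, 0]     # last reference latent, frame 0
print(ref.shape)                         # torch.Size([1, 16, 32, 32]) -> fits a Conv2d
```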
```diff
@@ -373,6 +373,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         flf_weight = state_dict.get('{}img_emb.emb_pos'.format(key_prefix))
         if flf_weight is not None:
             dit_config["flf_pos_embed_token_number"] = flf_weight.shape[1]
+
+        ref_conv_weight = state_dict.get('{}ref_conv.weight'.format(key_prefix))
+        if ref_conv_weight is not None:
+            dit_config["in_dim_ref_conv"] = ref_conv_weight.shape[1]
+
         return dit_config
 
     if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D
```
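Finally, detection works the way the rest of `detect_unet_config` does: the presence of `ref_conv.weight` in a checkpoint signals a model with the reference conv, and its second dimension recovers `in_dim_ref_conv`, since `Conv2d` weights are laid out as `[out_channels, in_channels, kH, kW]`. A self-contained sketch against a stand-in state dict (not a real checkpoint):

```python
import torch

# Stand-in for a loaded checkpoint: a ref_conv mapping 16 channels to 1024,
# with 2x2 spatial patches.
state_dict = {"ref_conv.weight": torch.zeros(1024, 16, 2, 2)}

dit_config = {}
ref_conv_weight = state_dict.get("ref_conv.weight")
if ref_conv_weight is not None:
    dit_config["in_dim_ref_conv"] = ref_conv_weight.shape[1]  # in_channels -> 16
print(dit_config)  # {'in_dim_ref_conv': 16}
```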