Support new hunyuan video i2v model.

Use the new "v2 (replace)" guidance type in HunyuanImageToVideo and set image_interleave to 4 on the "Text Encode Hunyuan Video" node.
2025-09-15 05:57:57 +00:00 · 2025-03-08 20:25:14 -05:00
parent 0952569493
commit 7395b0c0d1
4 changed files with 72 additions and 24 deletions
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -227,6 +227,7 @@ class HunyuanVideo(nn.Module):
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor = None,
+        guiding_frame_index=None,
        control=None,
        transformer_options={},
    ) -> Tensor:
@@ -237,7 +238,15 @@ class HunyuanVideo(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))

-        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+        if guiding_frame_index is not None:
+            token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0))
+            vec_ = self.vector_in(y[:, :self.params.vec_in_dim])
+            vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1)
+            frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2])
+            modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)]
+        else:
+            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+            modulation_dims = None

        if self.params.guidance_embed:
            if guidance is not None:
@@ -271,7 +280,7 @@ class HunyuanVideo(nn.Module):
                txt = out["txt"]
                img = out["img"]
            else:
-                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask)
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)

            if control is not None: # Controlnet
                control_i = control.get("input")
@@ -292,7 +301,7 @@ class HunyuanVideo(nn.Module):
                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)

            if control is not None: # Controlnet
                control_o = control.get("output")
@@ -303,7 +312,7 @@ class HunyuanVideo(nn.Module):

        img = img[:, : img_len]

-        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec, modulation_dims=modulation_dims)  # (N, T, patch_size ** 2 * out_channels)

        shape = initial_shape[-3:]
        for i in range(len(shape)):
@@ -313,7 +322,7 @@ class HunyuanVideo(nn.Module):
        img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4])
        return img

-    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, control=None, transformer_options={}, **kwargs):
        bs, c, t, h, w = x.shape
        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
@@ -325,5 +334,5 @@ class HunyuanVideo(nn.Module):
        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, control, transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, control, transformer_options)
        return out