Support LTXV 0.9.5.

Credits: Lightricks team.
2025-09-14 05:25:23 +00:00 · 2025-03-05 00:13:49 -05:00
parent 745b13649b
commit 93fedd92fe
11 changed files with 661 additions and 141 deletions
--- a/comfy/ldm/lightricks/symmetric_patchifier.py
+++ b/comfy/ldm/lightricks/symmetric_patchifier.py
@@ -6,16 +6,29 @@ from einops import rearrange
 from torch import Tensor


-def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
-    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
-    dims_to_append = target_dims - x.ndim
-    if dims_to_append < 0:
-        raise ValueError(
-            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
-        )
-    elif dims_to_append == 0:
-        return x
-    return x[(...,) + (None,) * dims_to_append]
+def latent_to_pixel_coords(
+    latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False
+) -> Tensor:
+    """
+    Converts latent coordinates to pixel coordinates by scaling them according to the VAE's
+    configuration.
+    Args:
+        latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
+        containing the latent corner coordinates of each token.
+        scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent space.
+        causal_fix (bool): Whether to take into account the different temporal scale
+            of the first frame. Default = False for backwards compatibility.
+    Returns:
+        Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
+    """
+    pixel_coords = (
+        latent_coords
+        * torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
+    )
+    if causal_fix:
+        # Fix temporal scale for first frame to 1 due to causality
+        pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
+    return pixel_coords


 class Patchifier(ABC):
@@ -44,29 +57,26 @@ class Patchifier(ABC):
    def patch_size(self):
        return self._patch_size

-    def get_grid(
-        self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
+    def get_latent_coords(
+        self, latent_num_frames, latent_height, latent_width, batch_size, device
    ):
-        f = orig_num_frames // self._patch_size[0]
-        h = orig_height // self._patch_size[1]
-        w = orig_width // self._patch_size[2]
-        grid_h = torch.arange(h, dtype=torch.float32, device=device)
-        grid_w = torch.arange(w, dtype=torch.float32, device=device)
-        grid_f = torch.arange(f, dtype=torch.float32, device=device)
-        grid = torch.meshgrid(grid_f, grid_h, grid_w, indexing='ij')
-        grid = torch.stack(grid, dim=0)
-        grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
-
-        if scale_grid is not None:
-            for i in range(3):
-                if isinstance(scale_grid[i], Tensor):
-                    scale = append_dims(scale_grid[i], grid.ndim - 1)
-                else:
-                    scale = scale_grid[i]
-                grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]
-
-        grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
-        return grid
+        """
+        Return a tensor of shape [batch_size, 3, num_patches] containing the
+            top-left corner latent coordinates of each latent patch.
+        The tensor is repeated for each batch element.
+        """
+        latent_sample_coords = torch.meshgrid(
+            torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
+            torch.arange(0, latent_height, self._patch_size[1], device=device),
+            torch.arange(0, latent_width, self._patch_size[2], device=device),
+            indexing="ij",
+        )
+        latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
+        latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
+        latent_coords = rearrange(
+            latent_coords, "b c f h w -> b c (f h w)", b=batch_size
+        )
+        return latent_coords


 class SymmetricPatchifier(Patchifier):
@@ -74,6 +84,8 @@ class SymmetricPatchifier(Patchifier):
        self,
        latents: Tensor,
    ) -> Tuple[Tensor, Tensor]:
+        b, _, f, h, w = latents.shape
+        latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
        latents = rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
@@ -81,7 +93,7 @@ class SymmetricPatchifier(Patchifier):
            p2=self._patch_size[1],
            p3=self._patch_size[2],
        )
-        return latents
+        return latents, latent_coords

    def unpatchify(
        self,