Uncap cosmos predict2 res and fix mem estimation. (#8518)

2025-08-02 03:07:07 +00:00 · 2025-06-13 04:30:18 -07:00 · 2025-06-13 04:30:18 -07:00 · c69af655aa
commit c69af655aa
parent 251f54a2ad
2 changed files with 11 additions and 10 deletions
--- a/comfy/ldm/cosmos/position_embedding.py
+++ b/comfy/ldm/cosmos/position_embedding.py
@ -72,7 +72,6 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
    ):
        del kwargs
        super().__init__()
-        self.register_buffer("seq", torch.arange(max(len_h, len_w, len_t), dtype=torch.float, device=device))
        self.base_fps = base_fps
        self.max_h = len_h
        self.max_w = len_w
@ -134,21 +133,19 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
        temporal_freqs = 1.0 / (t_theta**self.dim_temporal_range.to(device=device))

        B, T, H, W, _ = B_T_H_W_C
+        seq = torch.arange(max(H, W, T), dtype=torch.float, device=device)
        uniform_fps = (fps is None) or isinstance(fps, (int, float)) or (fps.min() == fps.max())
        assert (
            uniform_fps or B == 1 or T == 1
        ), "For video batch, batch size should be 1 for non-uniform fps. For image batch, T should be 1"
-        assert (
-            H <= self.max_h and W <= self.max_w
-        ), f"Input dimensions (H={H}, W={W}) exceed the maximum dimensions (max_h={self.max_h}, max_w={self.max_w})"
-        half_emb_h = torch.outer(self.seq[:H].to(device=device), h_spatial_freqs)
-        half_emb_w = torch.outer(self.seq[:W].to(device=device), w_spatial_freqs)
+        half_emb_h = torch.outer(seq[:H].to(device=device), h_spatial_freqs)
+        half_emb_w = torch.outer(seq[:W].to(device=device), w_spatial_freqs)

        # apply sequence scaling in temporal dimension
        if fps is None or self.enable_fps_modulation is False:  # image case
-            half_emb_t = torch.outer(self.seq[:T].to(device=device), temporal_freqs)
+            half_emb_t = torch.outer(seq[:T].to(device=device), temporal_freqs)
        else:
-            half_emb_t = torch.outer(self.seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)
+            half_emb_t = torch.outer(seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)

        half_emb_h = torch.stack([torch.cos(half_emb_h), -torch.sin(half_emb_h), torch.sin(half_emb_h), torch.cos(half_emb_h)], dim=-1)
        half_emb_w = torch.stack([torch.cos(half_emb_w), -torch.sin(half_emb_w), torch.sin(half_emb_w), torch.cos(half_emb_w)], dim=-1)
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -923,9 +923,13 @@ class CosmosT2IPredict2(supported_models_base.BASE):
    unet_extra_config = {}
    latent_format = latent_formats.Wan21

-    memory_usage_factor = 1.6 #TODO
+    memory_usage_factor = 1.0

-    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] #TODO
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    def __init__(self, unet_config):
+        super().__init__(unet_config)
+        self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.9

    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.CosmosPredict2(self, device=device)