Disable autocast in the UNet for increased speed.

2025-09-15 05:57:57 +00:00 · 2023-07-05 20:58:44 -04:00
parent 603f02d613
commit ddc6f12ad5
9 changed files with 84 additions and 79 deletions
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -278,7 +278,7 @@ class CrossAttentionDoggettx(nn.Module):
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
        del q_in, k_in, v_in

-        r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
+        r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)

        mem_free_total = model_management.get_free_memory(q.device)

@@ -314,7 +314,7 @@ class CrossAttentionDoggettx(nn.Module):
                        s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * self.scale
                    first_op_done = True

-                    s2 = s1.softmax(dim=-1)
+                    s2 = s1.softmax(dim=-1).to(v.dtype)
                    del s1

                    r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
--- a/comfy/ldm/modules/diffusionmodules/openaimodel.py
+++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@@ -220,7 +220,7 @@ class ResBlock(TimestepBlock):
        self.use_scale_shift_norm = use_scale_shift_norm

        self.in_layers = nn.Sequential(
-            normalization(channels, dtype=dtype),
+            nn.GroupNorm(32, channels, dtype=dtype),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1, dtype=dtype),
        )
@@ -244,7 +244,7 @@ class ResBlock(TimestepBlock):
            ),
        )
        self.out_layers = nn.Sequential(
-            normalization(self.out_channels, dtype=dtype),
+            nn.GroupNorm(32, self.out_channels, dtype=dtype),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
@@ -778,13 +778,13 @@ class UNetModel(nn.Module):
                self._feature_size += ch

        self.out = nn.Sequential(
-            normalization(ch, dtype=self.dtype),
+            nn.GroupNorm(32, ch, dtype=self.dtype),
            nn.SiLU(),
            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype)),
        )
        if self.predict_codebook_ids:
            self.id_predictor = nn.Sequential(
-            normalization(ch),
+            nn.GroupNorm(32, ch, dtype=self.dtype),
            conv_nd(dims, model_channels, n_embed, 1),
            #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
        )
@@ -821,7 +821,7 @@ class UNetModel(nn.Module):
            self.num_classes is not None
        ), "must specify y if and only if the model is class-conditional"
        hs = []
-        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(self.dtype)
        emb = self.time_embed(t_emb)

        if self.num_classes is not None:
--- a/comfy/ldm/modules/sub_quadratic_attention.py
+++ b/comfy/ldm/modules/sub_quadratic_attention.py
@@ -84,7 +84,7 @@ def _summarize_chunk(
    max_score, _ = torch.max(attn_weights, -1, keepdim=True)
    max_score = max_score.detach()
    torch.exp(attn_weights - max_score, out=attn_weights)
-    exp_weights = attn_weights
+    exp_weights = attn_weights.to(value.dtype)
    exp_values = torch.bmm(exp_weights, value)
    max_score = max_score.squeeze(-1)
    return AttnChunk(exp_values, exp_weights.sum(dim=-1), max_score)
@@ -166,7 +166,7 @@ def _get_attention_scores_no_kv_chunking(
        attn_scores /= summed
        attn_probs = attn_scores

-    hidden_states_slice = torch.bmm(attn_probs, value)
+    hidden_states_slice = torch.bmm(attn_probs.to(value.dtype), value)
    return hidden_states_slice

 class ScannedChunk(NamedTuple):