Fixes to make controlnet type models work on qwen edit and kontext. (#9581)

comfyanonymous
2025-08-27 12:26:28 -07:00
committed by GitHub
parent b20ba1f27c
commit b5ac6ed7ce
3 changed files with 8 additions and 6 deletions
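Kontext and the Qwen edit workflows append reference-image latent tokens to the image token sequence before it enters the transformer blocks, so the token dimension of img / hidden_states can be longer than the controlnet residual, which is computed only for the base image. Adding the two full-length tensors then fails or lands on the wrong tokens; the fix adds the residual to just the first add.shape[1] tokens. A minimal standalone sketch of the idea, using hypothetical token counts rather than the model code:

import torch

batch, base_tokens, ref_tokens, dim = 1, 4096, 4096, 64  # hypothetical sizes

# Token layout in an edit/reference workflow: base image tokens followed by
# appended reference-image tokens.
img = torch.zeros(batch, base_tokens + ref_tokens, dim)

# Controlnet residual, computed only for the base image tokens.
add = torch.randn(batch, base_tokens, dim)

try:
    img += add                      # old behaviour: shapes no longer match
except RuntimeError as e:
    print("full-length add fails:", e)

img[:, :add.shape[1]] += add        # fixed: only the leading image tokens
print(img[:, base_tokens:].abs().sum())  # reference tokens untouched -> tensor(0.)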


@@ -158,7 +158,7 @@ class Flux(nn.Module):
                 if i < len(control_i):
                     add = control_i[i]
                     if add is not None:
-                        img += add
+                        img[:, :add.shape[1]] += add
 
         if img.dtype == torch.float16:
             img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504)
@@ -189,7 +189,7 @@ class Flux(nn.Module):
                 if i < len(control_o):
                     add = control_o[i]
                     if add is not None:
-                        img[:, txt.shape[1] :, ...] += add
+                        img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add
 
         img = img[:, txt.shape[1] :, ...]

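In the single-stream blocks the text and image tokens are concatenated, so the control "output" residual is added after the text tokens; once Kontext reference tokens are appended behind the base image tokens, the open-ended slice img[:, txt.shape[1]:, ...] covers too many tokens and the shapes no longer match. The new slice stops where the residual ends. A small sketch of the indexing, with made-up token counts:

import torch

batch, dim = 1, 64
txt_tokens, base_img_tokens, ref_tokens = 512, 4096, 4096  # hypothetical

# Single-stream layout: [text tokens | base image tokens | reference tokens]
img = torch.zeros(batch, txt_tokens + base_img_tokens + ref_tokens, dim)
txt = torch.zeros(batch, txt_tokens, dim)
add = torch.randn(batch, base_img_tokens, dim)  # control residual for the base image only

# Old: img[:, txt.shape[1]:, ...] += add  -> (base + ref) tokens vs base tokens, mismatch.
# New: stop the slice where the residual ends.
img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add

assert img[:, :txt.shape[1]].abs().sum() == 0                  # text tokens untouched
assert img[:, txt.shape[1] + add.shape[1]:].abs().sum() == 0   # reference tokens untouched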

@@ -459,7 +459,7 @@ class QwenImageTransformer2DModel(nn.Module):
                 if i < len(control_i):
                     add = control_i[i]
                     if add is not None:
-                        hidden_states += add
+                        hidden_states[:, :add.shape[1]] += add
 
         hidden_states = self.norm_out(hidden_states, temb)
         hidden_states = self.proj_out(hidden_states)

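The Qwen image model gets the same treatment: in an edit workflow the reference latents are appended to hidden_states, so the controlnet residual only covers the leading tokens. A hedged helper capturing the idiom shared by both models (illustration only, not code from the repo):

import torch

def add_control_residual(tokens: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
    # tokens:   (batch, n_tokens, dim), possibly with reference/edit tokens appended
    # residual: (batch, n_control_tokens, dim) with n_control_tokens <= n_tokens
    tokens[:, :residual.shape[1]] += residual.to(tokens.dtype)
    return tokens

# e.g. hidden_states = add_control_residual(hidden_states, add)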

@@ -89,6 +89,7 @@ class DiffSynthCnetPatch:
         self.strength = strength
         self.mask = mask
         self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image))
+        self.encoded_image_size = (image.shape[1], image.shape[2])
 
     def encode_latent_cond(self, image):
         latent_image = self.vae.encode(image)
@@ -106,14 +107,15 @@ class DiffSynthCnetPatch:
         x = kwargs.get("x")
         img = kwargs.get("img")
         block_index = kwargs.get("block_index")
-        if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]:
-            spacial_compression = self.vae.spacial_compression_encode()
+        spacial_compression = self.vae.spacial_compression_encode()
+        if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression):
             image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
             loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
             self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1)))
+            self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1])
             comfy.model_management.load_models_gpu(loaded_models)
-        img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength)
+        img[:, :self.encoded_image.shape[1]] += (self.model_patch.model.control_block(img[:, :self.encoded_image.shape[1]], self.encoded_image.to(img.dtype), block_index) * self.strength)
         kwargs['img'] = img
         return kwargs
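The DiffSynth controlnet patch previously decided whether to re-encode its control image by comparing the encoded latent's shape against img, which breaks as soon as img carries extra reference tokens. It now remembers the pixel size it encoded at (encoded_image_size), compares that against the latent size times the VAE's spatial compression, and re-encodes only when the resolution actually changed; the control_block residual is then applied to just the leading encoded_image.shape[1] tokens of img. A simplified sketch of that last step, with stand-in names rather than the repo's exact hook signature:

import torch

def apply_control(img, encoded_image, control_block, block_index, strength):
    # img:           (batch, n_tokens, dim) transformer tokens, possibly with
    #                reference/edit tokens appended after the image tokens
    # encoded_image: (batch, n_image_tokens, dim) processed control latent
    # control_block: stand-in for the patch model's per-block control module
    n = encoded_image.shape[1]
    img[:, :n] += control_block(img[:, :n], encoded_image.to(img.dtype), block_index) * strength
    return img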