Fixes to make controlnet type models work on qwen edit and kontext. (#9581)

comfyanonymous
2025-08-27 12:26:28 -07:00
committed by GitHub
parent b20ba1f27c
commit b5ac6ed7ce
3 changed files with 8 additions and 6 deletions
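Kontext and the Qwen edit workflows append reference-image latent tokens to the image token sequence before it enters the transformer blocks, so the token dimension of img / hidden_states can be longer than the controlnet residual, which is computed only for the base image. Adding the two full-length tensors then fails or lands on the wrong tokens; the fix adds the residual to just the first add.shape[1] tokens. A minimal standalone sketch of the idea, using hypothetical token counts rather than the model code:

import torch

batch, base_tokens, ref_tokens, dim = 1, 4096, 4096, 64  # hypothetical sizes

# Token layout in an edit/reference workflow: base image tokens followed by
# appended reference-image tokens.
img = torch.zeros(batch, base_tokens + ref_tokens, dim)

# Controlnet residual, computed only for the base image tokens.
add = torch.randn(batch, base_tokens, dim)

try:
    img += add                      # old behaviour: shapes no longer match
except RuntimeError as e:
    print("full-length add fails:", e)

img[:, :add.shape[1]] += add        # fixed: only the leading image tokens
print(img[:, base_tokens:].abs().sum())  # reference tokens untouched -> tensor(0.)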


@@ -158,7 +158,7 @@ class Flux(nn.Module):
                 if i < len(control_i):
                     add = control_i[i]
                     if add is not None:
-                        img += add
+                        img[:, :add.shape[1]] += add
 
         if img.dtype == torch.float16:
             img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504)
@@ -189,7 +189,7 @@ class Flux(nn.Module):
                 if i < len(control_o):
                     add = control_o[i]
                     if add is not None:
-                        img[:, txt.shape[1] :, ...] += add
+                        img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add
 
         img = img[:, txt.shape[1] :, ...]

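In the single-stream blocks the text and image tokens are concatenated, so the control "output" residual is added after the text tokens; once Kontext reference tokens are appended behind the base image tokens, the open-ended slice img[:, txt.shape[1]:, ...] covers too many tokens and the shapes no longer match. The new slice stops where the residual ends. A small sketch of the indexing, with made-up token counts:

import torch

batch, dim = 1, 64
txt_tokens, base_img_tokens, ref_tokens = 512, 4096, 4096  # hypothetical

# Single-stream layout: [text tokens | base image tokens | reference tokens]
img = torch.zeros(batch, txt_tokens + base_img_tokens + ref_tokens, dim)
txt = torch.zeros(batch, txt_tokens, dim)
add = torch.randn(batch, base_img_tokens, dim)  # control residual for the base image only

# Old: img[:, txt.shape[1]:, ...] += add  -> (base + ref) tokens vs base tokens, mismatch.
# New: stop the slice where the residual ends.
img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add

assert img[:, :txt.shape[1]].abs().sum() == 0                  # text tokens untouched
assert img[:, txt.shape[1] + add.shape[1]:].abs().sum() == 0   # reference tokens untouched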

@@ -459,7 +459,7 @@ class QwenImageTransformer2DModel(nn.Module):
                 if i < len(control_i):
                     add = control_i[i]
                     if add is not None:
-                        hidden_states += add
+                        hidden_states[:, :add.shape[1]] += add
 
         hidden_states = self.norm_out(hidden_states, temb)
         hidden_states = self.proj_out(hidden_states)

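The Qwen image model gets the same treatment: in an edit workflow the reference latents are appended to hidden_states, so the controlnet residual only covers the leading tokens. A hedged helper capturing the idiom shared by both models (illustration only, not code from the repo):

import torch

def add_control_residual(tokens: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
    # tokens:   (batch, n_tokens, dim), possibly with reference/edit tokens appended
    # residual: (batch, n_control_tokens, dim) with n_control_tokens <= n_tokens
    tokens[:, :residual.shape[1]] += residual.to(tokens.dtype)
    return tokens

# e.g. hidden_states = add_control_residual(hidden_states, add)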

@@ -89,6 +89,7 @@ class DiffSynthCnetPatch:
         self.strength = strength
         self.mask = mask
         self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image))
+        self.encoded_image_size = (image.shape[1], image.shape[2])
 
     def encode_latent_cond(self, image):
         latent_image = self.vae.encode(image)
@@ -106,14 +107,15 @@ class DiffSynthCnetPatch:
         x = kwargs.get("x")
         img = kwargs.get("img")
         block_index = kwargs.get("block_index")
-        if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]:
-            spacial_compression = self.vae.spacial_compression_encode()
+        spacial_compression = self.vae.spacial_compression_encode()
+        if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression):
             image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
             loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
             self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1)))
+            self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1])
             comfy.model_management.load_models_gpu(loaded_models)
-        img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength)
+        img[:, :self.encoded_image.shape[1]] += (self.model_patch.model.control_block(img[:, :self.encoded_image.shape[1]], self.encoded_image.to(img.dtype), block_index) * self.strength)
         kwargs['img'] = img
         return kwargs
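The DiffSynth controlnet patch previously decided whether to re-encode its control image by comparing the encoded latent's shape against img, which breaks as soon as img carries extra reference tokens. It now remembers the pixel size it encoded at (encoded_image_size), compares that against the latent size times the VAE's spatial compression, and re-encodes only when the resolution actually changed; the control_block residual is then applied to just the leading encoded_image.shape[1] tokens of img. A simplified sketch of that last step, with stand-in names rather than the repo's exact hook signature:

import torch

def apply_control(img, encoded_image, control_block, block_index, strength):
    # img:           (batch, n_tokens, dim) transformer tokens, possibly with
    #                reference/edit tokens appended after the image tokens
    # encoded_image: (batch, n_image_tokens, dim) processed control latent
    # control_block: stand-in for the patch model's per-block control module
    n = encoded_image.shape[1]
    img[:, :n] += control_block(img[:, :n], encoded_image.to(img.dtype), block_index) * strength
    return img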