Cleanup chroma PR.

2025-09-12 04:27:21 +00:00 · 2025-04-30 20:57:30 -04:00
parent 4ca3d84277
commit 08ff5fa08a
9 changed files with 25 additions and 181 deletions
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -787,8 +787,8 @@ class PixArt(BaseModel):
        return out

 class Flux(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)
+    def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.flux.model.Flux):
+        super().__init__(model_config, model_type, device=device, unet_model=unet_model)

    def concat_cond(self, **kwargs):
        try:
@@ -1110,63 +1110,14 @@ class HiDream(BaseModel):
            out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond))
        return out

-class Chroma(BaseModel):
+class Chroma(Flux):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma)

-    def concat_cond(self, **kwargs):
-        try:
-            #Handle Flux control loras dynamically changing the img_in weight.
-            num_channels = self.diffusion_model.img_in.weight.shape[1]
-        except:
-            #Some cases like tensorrt might not have the weights accessible
-            num_channels = self.model_config.unet_config["in_channels"]
-
-        out_channels = self.model_config.unet_config["out_channels"]
-
-        if num_channels <= out_channels:
-            return None
-
-        image = kwargs.get("concat_latent_image", None)
-        noise = kwargs.get("noise", None)
-        device = kwargs["device"]
-
-        if image is None:
-            image = torch.zeros_like(noise)
-
-        image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
-        image = utils.resize_to_batch_size(image, noise.shape[0])
-        image = self.process_latent_in(image)
-        if num_channels <= out_channels * 2:
-            return image
-
-        #inpaint model
-        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
-        if mask is None:
-            mask = torch.ones_like(noise)[:, :1]
-
-        mask = torch.mean(mask, dim=1, keepdim=True)
-        mask = utils.common_upscale(mask.to(device), noise.shape[-1] * 8, noise.shape[-2] * 8, "bilinear", "center")
-        mask = mask.view(mask.shape[0], mask.shape[2] // 8, 8, mask.shape[3] // 8, 8).permute(0, 2, 4, 1, 3).reshape(mask.shape[0], -1, mask.shape[2] // 8, mask.shape[3] // 8)
-        mask = utils.resize_to_batch_size(mask, noise.shape[0])
-        return torch.cat((image, mask), dim=1)
-
-
    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-        # upscale the attention mask, since now we
-        attention_mask = kwargs.get("attention_mask", None)
-        if attention_mask is not None:
-            shape = kwargs["noise"].shape
-            mask_ref_size = kwargs["attention_mask_img_shape"]
-            # the model will pad to the patch size, and then divide
-            # essentially dividing and rounding up
-            (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
-            attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
-            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
-        guidance = 0.0
-        out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor((guidance,)))
+
+        guidance = kwargs.get("guidance", 0)
+        if guidance is not None:
+            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
        return out