USO style reference. (#9677)

Load the projector.safetensors file with the ModelPatchLoader node, then use the siglip_vision_patch14_384.safetensors "clip vision" model together with the USOStyleReferenceNode.
2025-09-10 11:35:40 +00:00 · 2025-09-02 12:36:22 -07:00
parent e2d1e5dad9
commit 3412d53b1d
5 changed files with 222 additions and 8 deletions
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -61,8 +61,12 @@ class CLIPEncoder(torch.nn.Module):
    def forward(self, x, mask=None, intermediate_output=None):
        optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

+        all_intermediate = None
        if intermediate_output is not None:
-            if intermediate_output < 0:
+            if intermediate_output == "all":
+                all_intermediate = []
+                intermediate_output = None
+            elif intermediate_output < 0:
                intermediate_output = len(self.layers) + intermediate_output

        intermediate = None
@@ -70,6 +74,12 @@ class CLIPEncoder(torch.nn.Module):
            x = l(x, mask, optimized_attention)
            if i == intermediate_output:
                intermediate = x.clone()
+            if all_intermediate is not None:
+                all_intermediate.append(x.unsqueeze(1).clone())
+
+        if all_intermediate is not None:
+            intermediate = torch.cat(all_intermediate, dim=1)
+
        return x, intermediate

 class CLIPEmbeddings(torch.nn.Module):
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -50,7 +50,13 @@ class ClipVisionModel():
        self.image_size = config.get("image_size", 224)
        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
        self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
-        model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model"))
+        model_type = config.get("model_type", "clip_vision_model")
+        model_class = IMAGE_ENCODERS.get(model_type)
+        if model_type == "siglip_vision_model":
+            self.return_all_hidden_states = True
+        else:
+            self.return_all_hidden_states = False
+
        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
@@ -68,12 +74,18 @@ class ClipVisionModel():
    def encode_image(self, image, crop=True):
        comfy.model_management.load_model_gpu(self.patcher)
        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
-        out = self.model(pixel_values=pixel_values, intermediate_output=-2)
+        out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2)

        outputs = Output()
        outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
        outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
-        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
+        if self.return_all_hidden_states:
+            all_hs = out[1].to(comfy.model_management.intermediate_device())
+            outputs["penultimate_hidden_states"] = all_hs[:, -2]
+            outputs["all_hidden_states"] = all_hs
+        else:
+            outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
+
        outputs["mm_projected"] = out[3]
        return outputs

--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@@ -106,6 +106,7 @@ class Flux(nn.Module):
        if y is None:
            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)

+        patches = transformer_options.get("patches", {})
        patches_replace = transformer_options.get("patches_replace", {})
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")
@@ -117,9 +118,17 @@ class Flux(nn.Module):
            if guidance is not None:
                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

-        vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
+        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
        txt = self.txt_in(txt)

+        if "post_input" in patches:
+            for p in patches["post_input"]:
+                out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids})
+                img = out["img"]
+                txt = out["txt"]
+                img_ids = out["img_ids"]
+                txt_ids = out["txt_ids"]
+
        if img_ids is not None:
            ids = torch.cat((txt_ids, img_ids), dim=1)
            pe = self.pe_embedder(ids)
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -433,6 +433,9 @@ class ModelPatcher:
    def set_model_double_block_patch(self, patch):
        self.set_model_patch(patch, "double_block")

+    def set_model_post_input_patch(self, patch):
+        self.set_model_patch(patch, "post_input")
+
    def add_object_patch(self, name, obj):
        self.object_patches[name] = obj