Support stable zero 123 model.

To use it, load the checkpoint with the ImageOnlyCheckpointLoader and use the new Stable_Zero123 node.
2025-09-11 20:17:30 +00:00 · 2023-12-18 03:18:40 -05:00
parent 2f9d6a97ec
commit 2258f85159
5 changed files with 119 additions and 2 deletions
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -328,3 +328,33 @@ class SVD_img2vid(BaseModel):
        out['image_only_indicator'] = comfy.conds.CONDConstant(torch.zeros((1,), device=device))
        out['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0])
        return out
+
+class Stable_Zero123(BaseModel):
+    # BaseModel variant that owns an extra linear layer (`cc_projection`) and
+    # builds the `c_concat` / `c_crossattn` conditioning entries in extra_conds().
+    def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
+        # NOTE(review): cc_projection_weight and cc_projection_bias default to None,
+        # but both are used unconditionally below (`.shape`, `copy_`), so callers
+        # must supply them - confirm whether a clearer error is wanted here.
+        super().__init__(model_config, model_type, device=device)
+        # Size the layer from the supplied weight: in_features = weight.shape[1],
+        # out_features = weight.shape[0].
+        self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device)
+        # Copy the supplied tensors into the freshly created layer in place.
+        self.cc_projection.weight.copy_(cc_projection_weight)
+        self.cc_projection.bias.copy_(cc_projection_bias)
+
+    def extra_conds(self, **kwargs):
+        # Build and return a dict of conditioning objects from kwargs.
+        # Reads "concat_latent_image", "noise" and "cross_attn"; always sets
+        # 'c_concat', and sets 'c_crossattn' only when "cross_attn" is provided.
+        out = {}
+
+        latent_image = kwargs.get("concat_latent_image", None)
+        # NOTE(review): "noise" is fetched with a None default but is used
+        # unconditionally below, so it is effectively required.
+        noise = kwargs.get("noise", None)
+
+        # No latent image supplied: fall back to zeros shaped like the noise.
+        if latent_image is None:
+            latent_image = torch.zeros_like(noise)
+
+        # Every dimension after the first must match the noise; otherwise
+        # resample to the noise's last two dimensions ("bilinear", "center").
+        if latent_image.shape[1:] != noise.shape[1:]:
+            latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
+
+        # Make the first dimension agree with noise.shape[0].
+        latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0])
+
+        out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image)
+
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            # Inputs whose last dimension is not 768 are passed through
+            # cc_projection first; inputs already at 768 are used as-is.
+            if cross_attn.shape[-1] != 768:
+                cross_attn = self.cc_projection(cross_attn)
+            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
+        return out