ComfyUI/comfy_extras/v3/nodes_stable3d.py

from __future__ import annotations

import torch

import comfy.utils
import nodes
from comfy_api.latest import io


def camera_embeddings(elevation, azimuth):
    elevation = torch.as_tensor([elevation])
    azimuth = torch.as_tensor([azimuth])
    embeddings = torch.stack(
        [
                torch.deg2rad(
                    (90 - elevation) - 90
                ),  # Zero123 polar is 90-elevation
                torch.sin(torch.deg2rad(azimuth)),
                torch.cos(torch.deg2rad(azimuth)),
                torch.deg2rad(
                    90 - torch.full_like(elevation, 0)
                ),
        ], dim=-1).unsqueeze(1)

    return embeddings


class StableZero123_Conditioning(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="StableZero123_Conditioning_V3",
            category="conditioning/3d_models",
            inputs=[
                io.ClipVision.Input("clip_vision"),
                io.Image.Input("init_image"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("height", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Float.Input("elevation", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
                io.Float.Input("azimuth", default=0.0, min=-180.0, max=180.0, step=0.1, round=False)
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent")
            ]
        )

    @classmethod
    def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth):
        output = clip_vision.encode_image(init_image)
        pooled = output.image_embeds.unsqueeze(0)
        pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
        encode_pixels = pixels[:,:,:,:3]
        t = vae.encode(encode_pixels)
        cam_embeds = camera_embeddings(elevation, azimuth)
        cond = torch.cat([pooled, cam_embeds.to(pooled.device).repeat((pooled.shape[0], 1, 1))], dim=-1)

        positive = [[cond, {"concat_latent_image": t}]]
        negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]]
        latent = torch.zeros([batch_size, 4, height // 8, width // 8])
        return io.NodeOutput(positive, negative, {"samples":latent})


class StableZero123_Conditioning_Batched(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="StableZero123_Conditioning_Batched_V3",
            category="conditioning/3d_models",
            inputs=[
                io.ClipVision.Input("clip_vision"),
                io.Image.Input("init_image"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("height", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Float.Input("elevation", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
                io.Float.Input("azimuth", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
                io.Float.Input("elevation_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
                io.Float.Input("azimuth_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False)
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent")
            ]
        )

    @classmethod
    def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth, elevation_batch_increment, azimuth_batch_increment):
        output = clip_vision.encode_image(init_image)
        pooled = output.image_embeds.unsqueeze(0)
        pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
        encode_pixels = pixels[:,:,:,:3]
        t = vae.encode(encode_pixels)

        cam_embeds = []
        for i in range(batch_size):
            cam_embeds.append(camera_embeddings(elevation, azimuth))
            elevation += elevation_batch_increment
            azimuth += azimuth_batch_increment

        cam_embeds = torch.cat(cam_embeds, dim=0)
        cond = torch.cat([comfy.utils.repeat_to_batch_size(pooled, batch_size), cam_embeds], dim=-1)

        positive = [[cond, {"concat_latent_image": t}]]
        negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]]
        latent = torch.zeros([batch_size, 4, height // 8, width // 8])
        return io.NodeOutput(positive, negative, {"samples":latent, "batch_index": [0] * batch_size})


class SV3D_Conditioning(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="SV3D_Conditioning_V3",
            category="conditioning/3d_models",
            inputs=[
                io.ClipVision.Input("clip_vision"),
                io.Image.Input("init_image"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=576, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("height", default=576, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("video_frames", default=21, min=1, max=4096),
                io.Float.Input("elevation", default=0.0, min=-90.0, max=90.0, step=0.1, round=False)
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent")
            ]
        )

    @classmethod
    def execute(cls, clip_vision, init_image, vae, width, height, video_frames, elevation):
        output = clip_vision.encode_image(init_image)
        pooled = output.image_embeds.unsqueeze(0)
        pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
        encode_pixels = pixels[:,:,:,:3]
        t = vae.encode(encode_pixels)

        azimuth = 0
        azimuth_increment = 360 / (max(video_frames, 2) - 1)

        elevations = []
        azimuths = []
        for i in range(video_frames):
            elevations.append(elevation)
            azimuths.append(azimuth)
            azimuth += azimuth_increment

        positive = [[pooled, {"concat_latent_image": t, "elevation": elevations, "azimuth": azimuths}]]
        negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t), "elevation": elevations, "azimuth": azimuths}]]
        latent = torch.zeros([video_frames, 4, height // 8, width // 8])
        return io.NodeOutput(positive, negative, {"samples":latent})


NODES_LIST = [
    StableZero123_Conditioning,
    StableZero123_Conditioning_Batched,
    SV3D_Conditioning,
]