mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2025-07-27 08:16:44 +00:00
166 lines
7.2 KiB
Python
166 lines
7.2 KiB
Python
from __future__ import annotations
|
|
|
|
import torch
|
|
|
|
import comfy.utils
|
|
import nodes
|
|
from comfy_api.latest import io
|
|
|
|
|
|
def camera_embeddings(elevation, azimuth):
    """Build the 4-component camera embedding for one viewing direction.

    Both angles are given in degrees. The returned components, in order
    along the last axis, are:

    0. ``deg2rad((90 - elevation) - 90)`` (polar-angle term),
    1. ``sin(deg2rad(azimuth))``,
    2. ``cos(deg2rad(azimuth))``,
    3. ``deg2rad(90)`` (constant term).

    Args:
        elevation: Elevation angle in degrees.
        azimuth: Azimuth angle in degrees.

    Returns:
        A tensor whose components are stacked on the last axis with an extra
        axis inserted at position 1; for scalar inputs the shape is
        ``(1, 1, 4)``.
    """
    elevation_deg = torch.as_tensor([elevation])
    azimuth_rad = torch.deg2rad(torch.as_tensor([azimuth]))

    # Zero123 polar is 90-elevation
    polar_term = torch.deg2rad((90 - elevation_deg) - 90)
    # Constant 90-degree term, shaped and typed like the elevation tensor.
    constant_term = torch.deg2rad(90 - torch.zeros_like(elevation_deg))

    components = [
        polar_term,
        torch.sin(azimuth_rad),
        torch.cos(azimuth_rad),
        constant_term,
    ]
    return torch.stack(components, dim=-1).unsqueeze(1)
|
|
|
|
|
|
class StableZero123_Conditioning(io.ComfyNode):
    """Node producing Stable Zero123 conditioning for a single camera pose.

    The node encodes the input image with CLIP vision and with the VAE,
    appends a camera embedding to the pooled CLIP embedding, and returns
    positive/negative conditioning plus an all-zero latent.
    """

    @classmethod
    def define_schema(cls):
        """Return the schema (id, category, inputs, outputs) of this node."""
        # Keyword sets shared by several inputs.
        resolution_opts = dict(min=16, max=nodes.MAX_RESOLUTION, step=8)
        angle_opts = dict(default=0.0, min=-180.0, max=180.0, step=0.1, round=False)

        node_inputs = [
            io.ClipVision.Input("clip_vision"),
            io.Image.Input("init_image"),
            io.Vae.Input("vae"),
            io.Int.Input("width", default=256, **resolution_opts),
            io.Int.Input("height", default=256, **resolution_opts),
            io.Int.Input("batch_size", default=1, min=1, max=4096),
            io.Float.Input("elevation", **angle_opts),
            io.Float.Input("azimuth", **angle_opts),
        ]
        node_outputs = [
            io.Conditioning.Output(display_name="positive"),
            io.Conditioning.Output(display_name="negative"),
            io.Latent.Output(display_name="latent"),
        ]
        return io.Schema(
            node_id="StableZero123_Conditioning_V3",
            category="conditioning/3d_models",
            inputs=node_inputs,
            outputs=node_outputs,
        )

    @classmethod
    def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth):
        """Compute conditioning and an empty latent for one camera pose.

        Args:
            clip_vision: Object whose ``encode_image`` result exposes ``image_embeds``.
            init_image: Image tensor; its last axis is moved to position 1 for resizing.
            vae: Object providing ``encode``.
            width: Target width used for resizing and for the latent.
            height: Target height used for resizing and for the latent.
            batch_size: Leading dimension of the returned latent.
            elevation: Camera elevation in degrees.
            azimuth: Camera azimuth in degrees.

        Returns:
            ``io.NodeOutput`` holding positive conditioning, negative
            conditioning and ``{"samples": zeros}``.
        """
        vision_output = clip_vision.encode_image(init_image)
        pooled = vision_output.image_embeds.unsqueeze(0)

        # Resize with the last axis temporarily at position 1, then restore it.
        resized = comfy.utils.common_upscale(
            init_image.movedim(-1, 1), width, height, "bilinear", "center"
        ).movedim(1, -1)
        # Keep only the first three entries of the last axis
        # (presumably dropping an alpha channel -- confirm).
        concat_latent = vae.encode(resized[:, :, :, :3])

        pose = camera_embeddings(elevation, azimuth).to(pooled.device)
        pose = pose.repeat((pooled.shape[0], 1, 1))
        cond = torch.cat([pooled, pose], dim=-1)

        positive = [[cond, {"concat_latent_image": concat_latent}]]
        negative = [[
            torch.zeros_like(pooled),
            {"concat_latent_image": torch.zeros_like(concat_latent)},
        ]]
        empty_latent = torch.zeros([batch_size, 4, height // 8, width // 8])
        return io.NodeOutput(positive, negative, {"samples": empty_latent})
|
|
|
|
|
|
class StableZero123_Conditioning_Batched(io.ComfyNode):
    """Node producing Stable Zero123 conditioning for a batch of camera poses.

    One camera embedding is generated per batch item; the elevation and
    azimuth advance by the given increments from one item to the next.
    """

    @classmethod
    def define_schema(cls):
        """Return the schema (id, category, inputs, outputs) of this node."""
        return io.Schema(
            node_id="StableZero123_Conditioning_Batched_V3",
            category="conditioning/3d_models",
            inputs=[
                io.ClipVision.Input("clip_vision"),
                io.Image.Input("init_image"),
                io.Vae.Input("vae"),
                io.Int.Input("width", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("height", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Float.Input("elevation", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
                io.Float.Input("azimuth", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
                io.Float.Input("elevation_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
                io.Float.Input("azimuth_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @classmethod
    def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth, elevation_batch_increment, azimuth_batch_increment):
        """Compute conditioning and an empty latent for a sweep of camera poses.

        Args:
            clip_vision: Object whose ``encode_image`` result exposes ``image_embeds``.
            init_image: Image tensor; its last axis is moved to position 1 for resizing.
            vae: Object providing ``encode``.
            width: Target width used for resizing and for the latent.
            height: Target height used for resizing and for the latent.
            batch_size: Number of camera poses and leading dimension of the latent.
            elevation: Elevation (degrees) of the first pose.
            azimuth: Azimuth (degrees) of the first pose.
            elevation_batch_increment: Degrees added to the elevation after each pose.
            azimuth_batch_increment: Degrees added to the azimuth after each pose.

        Returns:
            ``io.NodeOutput`` holding positive conditioning, negative
            conditioning and a latent dict with ``"samples"`` (zeros) and
            ``"batch_index"`` (a list of ``batch_size`` zeros).
        """
        output = clip_vision.encode_image(init_image)
        pooled = output.image_embeds.unsqueeze(0)

        # Resize with the last axis temporarily at position 1, then restore it.
        pixels = comfy.utils.common_upscale(
            init_image.movedim(-1, 1), width, height, "bilinear", "center"
        ).movedim(1, -1)
        # Keep only the first three entries of the last axis
        # (presumably dropping an alpha channel -- confirm).
        encode_pixels = pixels[:, :, :, :3]
        t = vae.encode(encode_pixels)

        # Accumulate the angles in locals instead of rebinding the parameters.
        current_elevation = elevation
        current_azimuth = azimuth
        per_item_embeds = []
        for _ in range(batch_size):
            per_item_embeds.append(camera_embeddings(current_elevation, current_azimuth))
            current_elevation += elevation_batch_increment
            current_azimuth += azimuth_batch_increment

        # camera_embeddings builds its tensors from Python scalars, so they
        # must be moved to pooled's device before torch.cat, which requires
        # all inputs on the same device.
        cam_embeds = torch.cat(per_item_embeds, dim=0).to(pooled.device)
        cond = torch.cat([comfy.utils.repeat_to_batch_size(pooled, batch_size), cam_embeds], dim=-1)

        positive = [[cond, {"concat_latent_image": t}]]
        negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]]
        latent = torch.zeros([batch_size, 4, height // 8, width // 8])
        return io.NodeOutput(positive, negative, {"samples": latent, "batch_index": [0] * batch_size})
|
|
|
|
|
|
class SV3D_Conditioning(io.ComfyNode):
    """Node producing SV3D conditioning for a sweep of video frames.

    Every frame uses the same elevation, while the azimuth starts at 0 and
    grows by a fixed step per frame. Both angle lists are attached to the
    positive and negative conditioning.
    """

    @classmethod
    def define_schema(cls):
        """Return the schema (id, category, inputs, outputs) of this node."""
        # Keyword set shared by the width and height inputs.
        resolution_opts = dict(default=576, min=16, max=nodes.MAX_RESOLUTION, step=8)

        node_inputs = [
            io.ClipVision.Input("clip_vision"),
            io.Image.Input("init_image"),
            io.Vae.Input("vae"),
            io.Int.Input("width", **resolution_opts),
            io.Int.Input("height", **resolution_opts),
            io.Int.Input("video_frames", default=21, min=1, max=4096),
            io.Float.Input("elevation", default=0.0, min=-90.0, max=90.0, step=0.1, round=False),
        ]
        node_outputs = [
            io.Conditioning.Output(display_name="positive"),
            io.Conditioning.Output(display_name="negative"),
            io.Latent.Output(display_name="latent"),
        ]
        return io.Schema(
            node_id="SV3D_Conditioning_V3",
            category="conditioning/3d_models",
            inputs=node_inputs,
            outputs=node_outputs,
        )

    @classmethod
    def execute(cls, clip_vision, init_image, vae, width, height, video_frames, elevation):
        """Compute conditioning and an empty latent for ``video_frames`` frames.

        Args:
            clip_vision: Object whose ``encode_image`` result exposes ``image_embeds``.
            init_image: Image tensor; its last axis is moved to position 1 for resizing.
            vae: Object providing ``encode``.
            width: Target width used for resizing and for the latent.
            height: Target height used for resizing and for the latent.
            video_frames: Number of frames; also the leading latent dimension.
            elevation: Elevation in degrees, repeated for every frame.

        Returns:
            ``io.NodeOutput`` holding positive conditioning, negative
            conditioning and ``{"samples": zeros}``.
        """
        vision_output = clip_vision.encode_image(init_image)
        pooled = vision_output.image_embeds.unsqueeze(0)

        # Resize with the last axis temporarily at position 1, then restore it.
        resized = comfy.utils.common_upscale(
            init_image.movedim(-1, 1), width, height, "bilinear", "center"
        ).movedim(1, -1)
        # Keep only the first three entries of the last axis
        # (presumably dropping an alpha channel -- confirm).
        concat_latent = vae.encode(resized[:, :, :, :3])

        # The step divides 360 degrees by (frames - 1); max(..., 2) avoids a
        # division by zero when only one frame is requested.
        azimuth_step = 360 / (max(video_frames, 2) - 1)

        elevations = [elevation] * video_frames
        azimuths = []
        current_azimuth = 0
        for _ in range(video_frames):
            azimuths.append(current_azimuth)
            current_azimuth += azimuth_step

        positive = [[
            pooled,
            {"concat_latent_image": concat_latent, "elevation": elevations, "azimuth": azimuths},
        ]]
        negative = [[
            torch.zeros_like(pooled),
            {"concat_latent_image": torch.zeros_like(concat_latent), "elevation": elevations, "azimuth": azimuths},
        ]]
        empty_latent = torch.zeros([video_frames, 4, height // 8, width // 8])
        return io.NodeOutput(positive, negative, {"samples": empty_latent})
|
|
|
|
|
|
# Node classes defined in this module, gathered in one list
# (presumably consumed by the node loader -- confirm).
NODES_LIST = [StableZero123_Conditioning, StableZero123_Conditioning_Batched, SV3D_Conditioning]
|