Initial ACE-Step model implementation. (#7972)

2025-09-10 19:46:38 +00:00 · 2025-05-07 05:33:34 -07:00
parent 271c9c5b9e
commit 16417b40d9
18 changed files with 19738 additions and 4 deletions
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@@ -0,0 +1,46 @@
+import torch
+import comfy.model_management
+
+
+class TextEncodeAceStepAudio:  # Node class: tokenizes tags plus lyrics through a clip object and returns its scheduled encoding.
+    @classmethod
+    def INPUT_TYPES(s):  # Declares three required inputs: clip, tags and lyrics.
+        return {"required": {
+            "clip": ("CLIP", ),
+            "tags": ("STRING", {"multiline": True, "dynamicPrompts": True}),  # multiline string input
+            "lyrics": ("STRING", {"multiline": True, "dynamicPrompts": True}),  # same options as tags
+            }}
+    RETURN_TYPES = ("CONDITIONING",)  # one output slot
+    FUNCTION = "encode"  # matches the method name below; presumably the entry point the host calls - TODO confirm
+
+    CATEGORY = "conditioning"
+
+    def encode(self, clip, tags, lyrics):  # Returns a 1-tuple holding the result of clip.encode_from_tokens_scheduled.
+        tokens = clip.tokenize(tags, lyrics=lyrics)  # tags is passed positionally, lyrics as a keyword argument
+        return (clip.encode_from_tokens_scheduled(tokens), )
+
+
+class EmptyAceStepLatentAudio:  # Node class: builds an all-zero latent tensor whose last dimension is derived from a duration in seconds.
+    def __init__(self):
+        self.device = comfy.model_management.intermediate_device()  # device later passed to torch.zeros in generate()
+
+    @classmethod
+    def INPUT_TYPES(s):  # Declares two required inputs: seconds (FLOAT) and batch_size (INT).
+        return {"required": {"seconds": ("FLOAT", {"default": 120.0, "min": 1.0, "max": 1000.0, "step": 0.1}),  # default 120.0, bounds 1.0 to 1000.0, step 0.1
+                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}),  # NOTE(review): tooltip says "latent images" but CATEGORY is "latent/audio" - confirm wording
+                             }}
+    RETURN_TYPES = ("LATENT",)  # one output slot
+    FUNCTION = "generate"  # matches the method name below; presumably the entry point the host calls - TODO confirm
+
+    CATEGORY = "latent/audio"
+
+    def generate(self, seconds, batch_size):  # Returns a 1-tuple holding a dict with keys "samples" and "type".
+        length = int(seconds * 44100 / 512 / 8)  # int() truncates toward zero; presumably 44100 is the sample rate and 512 * 8 the samples per latent frame - TODO confirm
+        latent = torch.zeros([batch_size, 8, 16, length], device=self.device)  # zero tensor of shape [batch_size, 8, 16, length]
+        return ({"samples": latent, "type": "audio"}, )
+
+
+NODE_CLASS_MAPPINGS = {  # Maps node name strings to the two node classes in this file; presumably read by the host's node loader - TODO confirm
+    "TextEncodeAceStepAudio": TextEncodeAceStepAudio,
+    "EmptyAceStepLatentAudio": EmptyAceStepLatentAudio,
+}