From bed60d6ed9c28687bfdc1e89d2bb84a22fa905f0 Mon Sep 17 00:00:00 2001 From: bigcat88 Date: Wed, 23 Jul 2025 10:13:47 +0300 Subject: [PATCH] refactored Preview/Save of audios --- comfy_api/v3/ui.py | 247 +++++++++++++++++++-------------- comfy_extras/v3/nodes_audio.py | 126 ++--------------- 2 files changed, 161 insertions(+), 212 deletions(-) diff --git a/comfy_api/v3/ui.py b/comfy_api/v3/ui.py index caaf894f6..9dc6f43c0 100644 --- a/comfy_api/v3/ui.py +++ b/comfy_api/v3/ui.py @@ -51,6 +51,16 @@ class SavedImages(_UIOutput): return data +class SavedAudios(_UIOutput): + """UI wrapper around one or more audio files on disk (FLAC / MP3 / Opus).""" + def __init__(self, results: list[SavedResult]): + super().__init__() + self.results = results + + def as_dict(self) -> dict: + return {"audio": self.results} + + def _get_directory_by_folder_type(folder_type: FolderType) -> str: if folder_type == FolderType.input: return folder_paths.get_input_directory() @@ -243,8 +253,134 @@ class ImageSaveHelper: return SavedImages([result], is_animated=len(images) > 1) +class AudioSaveHelper: + """A helper class with static methods to handle audio saving and metadata.""" + _OPUS_RATES = [8000, 12000, 16000, 24000, 48000] + + @staticmethod + def save_audio( + audio: dict, + filename_prefix: str, + folder_type: FolderType, + cls: Type[ComfyNodeV3] | None, + format: str = "flac", + quality: str = "128k", + ) -> list[SavedResult]: + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type) + ) + + metadata = {} + if not args.disable_metadata and cls is not None: + if cls.hidden.prompt is not None: + metadata["prompt"] = json.dumps(cls.hidden.prompt) + if cls.hidden.extra_pnginfo is not None: + for x in cls.hidden.extra_pnginfo: + metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) + + results = [] + for batch_number, waveform in enumerate(audio["waveform"].cpu()): + filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) + file = f"{filename_with_batch_num}_{counter:05}_.{format}" + output_path = os.path.join(full_output_folder, file) + + # Use original sample rate initially + sample_rate = audio["sample_rate"] + + # Handle Opus sample rate requirements + if format == "opus": + if sample_rate > 48000: + sample_rate = 48000 + elif sample_rate not in AudioSaveHelper._OPUS_RATES: + # Find the next highest supported rate + for rate in sorted(AudioSaveHelper._OPUS_RATES): + if rate > sample_rate: + sample_rate = rate + break + if sample_rate not in AudioSaveHelper._OPUS_RATES: # Fallback if still not supported + sample_rate = 48000 + + # Resample if necessary + if sample_rate != audio["sample_rate"]: + waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) + + # Create output with specified format + output_buffer = BytesIO() + output_container = av.open(output_buffer, mode="w", format=format) + + # Set metadata on the container + for key, value in metadata.items(): + output_container.metadata[key] = value + + # Set up the output stream with appropriate properties + if format == "opus": + out_stream = output_container.add_stream("libopus", rate=sample_rate) + if quality == "64k": + out_stream.bit_rate = 64000 + elif quality == "96k": + out_stream.bit_rate = 96000 + elif quality == "128k": + out_stream.bit_rate = 128000 + elif quality == "192k": + out_stream.bit_rate = 192000 + elif quality == "320k": + out_stream.bit_rate = 320000 + elif format == "mp3": + out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) + if quality == "V0": + # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool + out_stream.codec_context.qscale = 1 + elif quality == "128k": + out_stream.bit_rate = 128000 + elif quality == "320k": + out_stream.bit_rate = 320000 + else: # format == "flac": + out_stream = output_container.add_stream("flac", rate=sample_rate) + + frame = av.AudioFrame.from_ndarray( + waveform.movedim(0, 1).reshape(1, -1).float().numpy(), + format="flt", + layout="mono" if waveform.shape[0] == 1 else "stereo", + ) + frame.sample_rate = sample_rate + frame.pts = 0 + output_container.mux(out_stream.encode(frame)) + + # Flush encoder + output_container.mux(out_stream.encode(None)) + + # Close containers + output_container.close() + + # Write the output to file + output_buffer.seek(0) + with open(output_path, "wb") as f: + f.write(output_buffer.getbuffer()) + + results.append(SavedResult(file, subfolder, folder_type)) + counter += 1 + + return results + + @staticmethod + def get_save_audio_ui( + audio, filename_prefix: str, cls: Type[ComfyNodeV3] | None, format: str = "flac", quality: str = "128k", + ) -> SavedAudios: + """Save and instantly wrap for UI.""" + return SavedAudios( + AudioSaveHelper.save_audio( + audio, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + format=format, + quality=quality, + ) + ) + + class PreviewImage(_UIOutput): - def __init__(self, image: Image.Type, animated: bool=False, cls: ComfyNodeV3=None, **kwargs): + def __init__(self, image: Image.Type, animated: bool = False, cls: Type[ComfyNodeV3] = None, **kwargs): self.values = ImageSaveHelper.save_images( image, filename_prefix="ComfyUI_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for _ in range(5)), @@ -316,108 +452,17 @@ class PreviewMask(PreviewImage): class PreviewAudio(_UIOutput): - def __init__(self, audio, cls: ComfyNodeV3=None, **kwargs): - quality = "128k" - format = "flac" - - filename_prefix = "ComfyUI_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for x in range(5)) - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( - filename_prefix, folder_paths.get_temp_directory() + def __init__(self, audio: dict, cls: Type[ComfyNodeV3] = None, **kwargs): + self.values = AudioSaveHelper.save_audio( + audio, + filename_prefix="ComfyUI_temp_" + "".join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5)), + folder_type=FolderType.temp, + cls=cls, + format="flac", + quality="128k", ) - # Prepare metadata dictionary - metadata = {} - if not args.disable_metadata and cls is not None: - if cls.hidden.prompt is not None: - metadata["prompt"] = json.dumps(cls.hidden.prompt) - if cls.hidden.extra_pnginfo is not None: - for x in cls.hidden.extra_pnginfo: - metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) - - # Opus supported sample rates - OPUS_RATES = [8000, 12000, 16000, 24000, 48000] - results = [] - for (batch_number, waveform) in enumerate(audio["waveform"].cpu()): - filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) - file = f"{filename_with_batch_num}_{counter:05}_.{format}" - output_path = os.path.join(full_output_folder, file) - - # Use original sample rate initially - sample_rate = audio["sample_rate"] - - # Handle Opus sample rate requirements - if format == "opus": - if sample_rate > 48000: - sample_rate = 48000 - elif sample_rate not in OPUS_RATES: - # Find the next highest supported rate - for rate in sorted(OPUS_RATES): - if rate > sample_rate: - sample_rate = rate - break - if sample_rate not in OPUS_RATES: # Fallback if still not supported - sample_rate = 48000 - - # Resample if necessary - if sample_rate != audio["sample_rate"]: - waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) - - # Create output with specified format - output_buffer = BytesIO() - output_container = av.open(output_buffer, mode='w', format=format) - - # Set metadata on the container - for key, value in metadata.items(): - output_container.metadata[key] = value - - # Set up the output stream with appropriate properties - if format == "opus": - out_stream = output_container.add_stream("libopus", rate=sample_rate) - if quality == "64k": - out_stream.bit_rate = 64000 - elif quality == "96k": - out_stream.bit_rate = 96000 - elif quality == "128k": - out_stream.bit_rate = 128000 - elif quality == "192k": - out_stream.bit_rate = 192000 - elif quality == "320k": - out_stream.bit_rate = 320000 - elif format == "mp3": - out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) - if quality == "V0": - # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool - out_stream.codec_context.qscale = 1 - elif quality == "128k": - out_stream.bit_rate = 128000 - elif quality == "320k": - out_stream.bit_rate = 320000 - else: # format == "flac": - out_stream = output_container.add_stream("flac", rate=sample_rate) - - frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', - layout='mono' if waveform.shape[0] == 1 else 'stereo') - frame.sample_rate = sample_rate - frame.pts = 0 - output_container.mux(out_stream.encode(frame)) - - # Flush encoder - output_container.mux(out_stream.encode(None)) - - # Close containers - output_container.close() - - # Write the output to file - output_buffer.seek(0) - with open(output_path, 'wb') as f: - f.write(output_buffer.getbuffer()) - - results.append(SavedResult(file, subfolder, FolderType.temp)) - counter += 1 - - self.values = results - - def as_dict(self): + def as_dict(self) -> dict: return {"audio": self.values} diff --git a/comfy_extras/v3/nodes_audio.py b/comfy_extras/v3/nodes_audio.py index 80c502df2..394709d1b 100644 --- a/comfy_extras/v3/nodes_audio.py +++ b/comfy_extras/v3/nodes_audio.py @@ -1,18 +1,14 @@ from __future__ import annotations import hashlib -import json import os -from io import BytesIO -import av import torch import torchaudio import comfy.model_management import folder_paths import node_helpers -from comfy.cli_args import args from comfy_api.v3 import io, ui @@ -142,8 +138,12 @@ class SaveAudioMP3(io.ComfyNodeV3): ) @classmethod - def execute(self, audio, filename_prefix="ComfyUI", format="mp3", quality="V0") -> io.NodeOutput: - return _save_audio(self, audio, filename_prefix, format, quality) + def execute(cls, audio, filename_prefix="ComfyUI", format="mp3", quality="V0") -> io.NodeOutput: + return io.NodeOutput( + ui=ui.AudioSaveHelper.get_save_audio_ui( + audio, filename_prefix=filename_prefix, cls=cls, format=format, quality=quality + ) + ) class SaveAudioOpus(io.ComfyNodeV3): @@ -163,8 +163,12 @@ class SaveAudioOpus(io.ComfyNodeV3): ) @classmethod - def execute(self, audio, filename_prefix="ComfyUI", format="opus", quality="128k") -> io.NodeOutput: - return _save_audio(self, audio, filename_prefix, format, quality) + def execute(cls, audio, filename_prefix="ComfyUI", format="opus", quality="128k") -> io.NodeOutput: + return io.NodeOutput( + ui=ui.AudioSaveHelper.get_save_audio_ui( + audio, filename_prefix=filename_prefix, cls=cls, format=format, quality=quality + ) + ) class SaveAudio(io.ComfyNodeV3): @@ -184,7 +188,9 @@ class SaveAudio(io.ComfyNodeV3): @classmethod def execute(cls, audio, filename_prefix="ComfyUI", format="flac") -> io.NodeOutput: - return _save_audio(cls, audio, filename_prefix, format) + return io.NodeOutput( + ui=ui.AudioSaveHelper.get_save_audio_ui(audio, filename_prefix=filename_prefix, cls=cls, format=format) + ) class VAEDecodeAudio(io.ComfyNodeV3): @@ -232,108 +238,6 @@ class VAEEncodeAudio(io.ComfyNodeV3): return io.NodeOutput({"samples": vae.encode(waveform.movedim(1, -1))}) -def _save_audio(cls, audio, filename_prefix="ComfyUI", format="flac", quality="128k") -> io.NodeOutput: - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( - filename_prefix, folder_paths.get_output_directory() - ) - - # Prepare metadata dictionary - metadata = {} - if not args.disable_metadata: - if cls.hidden.prompt is not None: - metadata["prompt"] = json.dumps(cls.hidden.prompt) - if cls.hidden.extra_pnginfo is not None: - for x in cls.hidden.extra_pnginfo: - metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) - - # Opus supported sample rates - OPUS_RATES = [8000, 12000, 16000, 24000, 48000] - - results = [] - for batch_number, waveform in enumerate(audio["waveform"].cpu()): - filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) - file = f"{filename_with_batch_num}_{counter:05}_.{format}" - output_path = os.path.join(full_output_folder, file) - - # Use original sample rate initially - sample_rate = audio["sample_rate"] - - # Handle Opus sample rate requirements - if format == "opus": - if sample_rate > 48000: - sample_rate = 48000 - elif sample_rate not in OPUS_RATES: - # Find the next highest supported rate - for rate in sorted(OPUS_RATES): - if rate > sample_rate: - sample_rate = rate - break - if sample_rate not in OPUS_RATES: # Fallback if still not supported - sample_rate = 48000 - - # Resample if necessary - if sample_rate != audio["sample_rate"]: - waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) - - # Create output with specified format - output_buffer = BytesIO() - output_container = av.open(output_buffer, mode="w", format=format) - - # Set metadata on the container - for key, value in metadata.items(): - output_container.metadata[key] = value - - # Set up the output stream with appropriate properties - if format == "opus": - out_stream = output_container.add_stream("libopus", rate=sample_rate) - if quality == "64k": - out_stream.bit_rate = 64000 - elif quality == "96k": - out_stream.bit_rate = 96000 - elif quality == "128k": - out_stream.bit_rate = 128000 - elif quality == "192k": - out_stream.bit_rate = 192000 - elif quality == "320k": - out_stream.bit_rate = 320000 - elif format == "mp3": - out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) - if quality == "V0": - # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool - out_stream.codec_context.qscale = 1 - elif quality == "128k": - out_stream.bit_rate = 128000 - elif quality == "320k": - out_stream.bit_rate = 320000 - else: # format == "flac": - out_stream = output_container.add_stream("flac", rate=sample_rate) - - frame = av.AudioFrame.from_ndarray( - waveform.movedim(0, 1).reshape(1, -1).float().numpy(), - format="flt", - layout="mono" if waveform.shape[0] == 1 else "stereo", - ) - frame.sample_rate = sample_rate - frame.pts = 0 - output_container.mux(out_stream.encode(frame)) - - # Flush encoder - output_container.mux(out_stream.encode(None)) - - # Close containers - output_container.close() - - # Write the output to file - output_buffer.seek(0) - with open(output_path, "wb") as f: - f.write(output_buffer.getbuffer()) - - results.append(ui.SavedResult(file, subfolder, io.FolderType.output)) - counter += 1 - - return io.NodeOutput(ui={"audio": results}) - - NODES_LIST: list[type[io.ComfyNodeV3]] = [ ConditioningStableAudio, EmptyLatentAudio,