Trim audio to the video's duration when saving video. (#9617)

This commit is contained in:
comfyanonymous
2025-08-29 01:12:00 -07:00
committed by GitHub
parent c7bb3e2bce
commit 15aa9222c4

View File

@@ -8,6 +8,7 @@ import av
import io import io
import json import json
import numpy as np import numpy as np
import math
import torch import torch
from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
@@ -282,8 +283,6 @@ class VideoFromComponents(VideoInput):
if self.__components.audio: if self.__components.audio:
audio_sample_rate = int(self.__components.audio['sample_rate']) audio_sample_rate = int(self.__components.audio['sample_rate'])
audio_stream = output.add_stream('aac', rate=audio_sample_rate) audio_stream = output.add_stream('aac', rate=audio_sample_rate)
audio_stream.sample_rate = audio_sample_rate
audio_stream.format = 'fltp'
# Encode video # Encode video
for i, frame in enumerate(self.__components.images): for i, frame in enumerate(self.__components.images):
@@ -298,27 +297,12 @@ class VideoFromComponents(VideoInput):
output.mux(packet) output.mux(packet)
if audio_stream and self.__components.audio: if audio_stream and self.__components.audio:
# Encode audio waveform = self.__components.audio['waveform']
samples_per_frame = int(audio_sample_rate / frame_rate) waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo')
for i in range(num_frames): frame.sample_rate = audio_sample_rate
start = i * samples_per_frame frame.pts = 0
end = start + samples_per_frame output.mux(audio_stream.encode(frame))
# TODO(Feature) - Add support for stereo audio
chunk = (
self.__components.audio["waveform"][0, 0, start:end]
.unsqueeze(0)
.contiguous()
.numpy()
)
audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono')
audio_frame.sample_rate = audio_sample_rate
audio_frame.pts = i * samples_per_frame
for packet in audio_stream.encode(audio_frame):
output.mux(packet)
# Flush audio
for packet in audio_stream.encode(None):
output.mux(packet)
# Flush encoder
output.mux(audio_stream.encode(None))