This is not needed. (#8991)

This commit is contained in:
comfyanonymous 2025-07-21 13:48:25 -07:00 committed by GitHub
parent 5249e45a1c
commit 0aa1c58b04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -288,7 +288,7 @@ def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
return wav.float() / (2 ** 31) return wav.float() / (2 ** 31)
raise ValueError(f"Unsupported wav dtype: {wav.dtype}") raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[torch.Tensor, int]: def load(filepath: str) -> tuple[torch.Tensor, int]:
with av.open(filepath) as af: with av.open(filepath) as af:
if not af.streams.audio: if not af.streams.audio:
raise ValueError("No audio stream found in the file.") raise ValueError("No audio stream found in the file.")
@ -297,40 +297,20 @@ def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[to
sr = stream.codec_context.sample_rate sr = stream.codec_context.sample_rate
n_channels = stream.channels n_channels = stream.channels
seek_time = frame_offset / sr if frame_offset > 0 else 0.0
duration = num_frames / sr if num_frames > 0 else -1.0
sample_offset = int(sr * seek_time)
num_samples = int(sr * duration) if duration >= 0 else -1
# Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary'
seek_sec = max(0, seek_time - 0.1) if filepath.lower().endswith('.mp3') else seek_time
af.seek(int(seek_sec / stream.time_base), stream=stream)
frames = [] frames = []
length = 0 length = 0
for frame in af.decode(streams=stream.index): for frame in af.decode(streams=stream.index):
current_offset = int(frame.rate * frame.pts * frame.time_base)
strip = max(0, sample_offset - current_offset)
buf = torch.from_numpy(frame.to_ndarray()) buf = torch.from_numpy(frame.to_ndarray())
if buf.shape[0] != n_channels: if buf.shape[0] != n_channels:
buf = buf.view(-1, n_channels).t() buf = buf.view(-1, n_channels).t()
buf = buf[:, strip:]
frames.append(buf) frames.append(buf)
length += buf.shape[1] length += buf.shape[1]
if num_samples > 0 and length >= num_samples:
break
if not frames: if not frames:
raise ValueError("No audio frames decoded.") raise ValueError("No audio frames decoded.")
wav = torch.cat(frames, dim=1) wav = torch.cat(frames, dim=1)
if num_samples > 0:
wav = wav[:, :num_samples]
wav = f32_pcm(wav) wav = f32_pcm(wav)
return wav, sr return wav, sr