# Spaces: Running
# (Page-header residue from the hosting site's listing; not part of the program.)
import json
import os
import re
import zipfile
from datetime import timedelta

import ffmpeg
import gradio as gr
import yt_dlp
def download_video(url, output_dir="downloads"):
    """Download the best audio stream of *url* and return the path of the WAV file.

    yt-dlp is configured with the ``FFmpegExtractAudio`` post-processor so the
    downloaded stream is converted to WAV after the download finishes.

    Args:
        url: Address of the video to fetch (passed straight to yt-dlp).
        output_dir: Directory that receives the download; created if missing.

    Returns:
        Path of the extracted ``.wav`` file inside *output_dir*.
    """
    os.makedirs(output_dir, exist_ok=True)
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav', 'preferredquality': '192'}]
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        downloaded_path = ydl.prepare_filename(info_dict)
    # Swap only the final extension. A plain str.replace() on the extension
    # would also rewrite matching substrings in the title or directory name
    # (e.g. "my webm tips.webm" -> "my wav tips.wav") and return a wrong path.
    return os.path.splitext(downloaded_path)[0] + '.wav'
def parse_srt(srt_content):
    """Parse caption text in SubRip (SRT) layout into a list of caption dicts.

    Each caption block is expected to contain a timing line of the form
    ``START --> END`` followed by one or more lines of text. Anything before
    the timing line (such as the numeric cue index) is ignored.

    Args:
        srt_content: Full caption file contents as a string.

    Returns:
        A list of ``{"start": str, "end": str, "text": str}`` dicts in file
        order. Timestamps are returned as they appear in the file; multi-line
        cue text is joined with single spaces. Blocks without a timing line or
        without any text are skipped.
    """
    captions = []
    # Caption files are frequently saved with Windows (CRLF) line endings;
    # normalise them so that block and line splitting behave the same.
    normalized = srt_content.replace("\r\n", "\n").replace("\r", "\n")
    # Split on any run of blank (or whitespace-only) lines, not exactly one.
    for block in re.split(r"\n\s*\n", normalized.strip()):
        lines = [line.strip() for line in block.split("\n")]
        timing_idx = next(
            (i for i, line in enumerate(lines) if "-->" in line), None
        )
        if timing_idx is None:
            continue  # header, note or garbage block: no timing information
        start_part, _, end_part = lines[timing_idx].partition("-->")
        start_time = start_part.strip()
        # Only the first token after the arrow is the end timestamp; anything
        # following it on the same line is not part of the time value.
        end_tokens = end_part.split()
        text = " ".join(line for line in lines[timing_idx + 1:] if line)
        if not start_time or not end_tokens or not text:
            continue
        captions.append({"start": start_time, "end": end_tokens[0], "text": text})
    return captions
def srt_to_seconds(time_str):
    """Convert a caption timestamp to a number of seconds.

    Accepts ``HH:MM:SS,mmm`` (SRT) as well as the ``.`` millisecond separator
    and the short ``MM:SS`` form; the fractional part is optional and is
    interpreted as milliseconds. Surrounding whitespace is ignored.

    Args:
        time_str: Timestamp string, e.g. ``"00:01:02,500"``.

    Returns:
        The offset in seconds as a float.

    Raises:
        ValueError: If the timestamp does not have two or three
            colon-separated integer fields.
    """
    # Treat "." like "," so both separator styles share one code path.
    clock, _, millis = time_str.strip().replace('.', ',').partition(',')
    fields = [int(part) for part in clock.split(':')]
    if len(fields) == 2:
        fields.insert(0, 0)  # MM:SS -> 0:MM:SS
    if len(fields) != 3:
        raise ValueError(f"Unrecognised timestamp: {time_str!r}")
    h, m, s = fields
    return h * 3600 + m * 60 + s + (int(millis) if millis else 0) / 1000
def cut_audio(audio_path, captions, output_dir="tts_dataset"):
    """Slice *audio_path* into one WAV clip per caption.

    Args:
        audio_path: Path of the source audio file handed to ffmpeg.
        captions: Iterable of dicts with ``start``, ``end`` and ``text`` keys;
            the two timestamps are converted with ``srt_to_seconds``.
        output_dir: Directory that receives the clips; created if missing.

    Returns:
        A list of ``{"file": clip_path, "text": caption_text}`` dicts, one per
        caption, in input order. Clips are named ``clip_<index>.wav``.
    """
    os.makedirs(output_dir, exist_ok=True)
    manifest = []
    for clip_no, caption in enumerate(captions):
        clip_start = srt_to_seconds(caption['start'])
        clip_end = srt_to_seconds(caption['end'])
        clip_path = os.path.join(output_dir, f"clip_{clip_no}.wav")
        # Build the ffmpeg job step by step: trimmed input -> WAV output.
        trimmed = ffmpeg.input(audio_path, ss=clip_start, to=clip_end)
        job = trimmed.output(clip_path, format='wav')
        job.run(quiet=True, overwrite_output=True)
        manifest.append({"file": clip_path, "text": caption['text']})
    return manifest
def zip_dataset(output_dir="tts_dataset", zip_name="tts_data.zip"):
    """Pack every file under *output_dir* into a ZIP stored in that directory.

    Args:
        output_dir: Directory whose contents are archived.
        zip_name: File name of the archive, created inside *output_dir*.

    Returns:
        Path of the written archive (``output_dir/zip_name``).
    """
    zip_path = os.path.join(output_dir, zip_name)
    zip_abs = os.path.abspath(zip_path)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(output_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive lives inside the directory being walked; skip it
                # so the ZIP does not try to add itself while being written.
                if os.path.abspath(file_path) == zip_abs:
                    continue
                # Store paths relative to output_dir so files in nested
                # folders keep distinct names instead of colliding.
                zipf.write(file_path, arcname=os.path.relpath(file_path, output_dir))
    return zip_path
def _read_caption_text(caption_file):
    """Return the uploaded caption file's contents as text (UTF-8, BOM removed)."""
    if isinstance(caption_file, (str, os.PathLike)):
        # The upload arrived as a filesystem path.
        with open(caption_file, "rb") as fh:
            raw = fh.read()
    elif hasattr(caption_file, "read"):
        raw = caption_file.read()
    else:
        # Object that only exposes the path of the uploaded file.
        with open(caption_file.name, "rb") as fh:
            raw = fh.read()
    if isinstance(raw, bytes):
        return raw.decode("utf-8-sig")
    return raw.lstrip("\ufeff")


def process_data(video_url, audio_file, caption_file):
    """Build a zipped TTS dataset from an audio source and a caption file.

    The audio comes from *video_url* (downloaded) when it is non-empty,
    otherwise from the uploaded *audio_file*. The audio is cut into one clip
    per caption, a ``metadata.json`` manifest is written next to the clips in
    ``tts_dataset`` and the directory is zipped.

    Args:
        video_url: Video address to download audio from; may be empty.
        audio_file: Uploaded audio (path string or object with ``.name``);
            used only when *video_url* is empty.
        caption_file: Uploaded caption file (path string or file-like object).

    Returns:
        Path of the ZIP archive containing the clips and ``metadata.json``.

    Raises:
        gr.Error: If no audio source or no caption file was provided. The
            output component is a file, so a plain message string cannot be
            returned to report the problem.

    NOTE(review): ``tts_dataset`` is not cleared between calls, so clips left
    over from an earlier, longer run remain in the directory and are zipped
    again.
    """
    # Validate everything up front so a missing caption file does not cost a
    # full video download first.
    if not video_url and not audio_file:
        raise gr.Error("No valid input provided.")
    if caption_file is None:
        raise gr.Error("No caption file provided.")

    if video_url:
        audio_path = download_video(video_url)
    elif isinstance(audio_file, (str, os.PathLike)):
        audio_path = audio_file
    else:
        audio_path = audio_file.name

    captions = parse_srt(_read_caption_text(caption_file))
    metadata = cut_audio(audio_path, captions)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be explicit rather than whatever the platform default happens to be.
    with open("tts_dataset/metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)
    zip_path = zip_dataset()
    return zip_path
# Web UI: three inputs (URL, audio upload, caption upload) mapped positionally
# onto process_data's parameters, and a single downloadable file as output.
demo = gr.Interface(
    title="TTS Dataset Cutter",
    description="Upload a video/audio file + caption file (SRT/VTT) to extract speech clips for TTS training. Supports YouTube download.",
    fn=process_data,
    inputs=[
        gr.Textbox(label="YouTube Video URL (optional)"),
        gr.File(label="Upload Audio File (optional)"),
        gr.File(label="Upload Caption File (SRT/VTT)"),
    ],
    outputs=gr.File(label="Download Processed ZIP"),
)

# Start the web server as soon as the module is executed.
demo.launch()