"""Gradio app that cuts an audio track into caption-aligned clips for TTS training.

The audio comes either from a YouTube URL (downloaded with yt-dlp) or from an
uploaded file. A caption file (SRT or WebVTT) supplies the clip boundaries and
the transcript of each clip. The result is a ZIP archive containing the clips
and a ``metadata.json`` file.
"""

import json
import os
import re
import tempfile
import zipfile
from datetime import timedelta

import ffmpeg
import gradio as gr
import yt_dlp


def download_video(url, output_dir="downloads"):
    """Download the best audio stream of *url* and convert it to WAV.

    Args:
        url: Address of the video to fetch with yt-dlp.
        output_dir: Directory that receives the download; created if missing.

    Returns:
        Path of the WAV file produced by the ``FFmpegExtractAudio``
        post-processor.
    """
    os.makedirs(output_dir, exist_ok=True)
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        downloaded_name = ydl.prepare_filename(info_dict)
    # Swap only the real suffix. A plain str.replace() on the extension would
    # also rewrite any occurrence of e.g. "webm" or "m4a" inside the title.
    return os.path.splitext(downloaded_name)[0] + ".wav"


def parse_srt(srt_content):
    """Parse SRT (or WebVTT) caption text into a list of cues.

    Blocks are separated by blank lines. A block is kept when it contains a
    timing line (``start --> end``) followed by at least one non-empty text
    line; everything else (``WEBVTT`` header, notes, malformed blocks) is
    ignored. CRLF line endings, a leading BOM, a missing cue index and cue
    settings after the end timestamp are all tolerated.

    Args:
        srt_content: Full text of the caption file.

    Returns:
        A list of dicts with the keys ``"start"``, ``"end"`` (timestamps as
        they appear in the file) and ``"text"`` (cue lines joined by spaces).
    """
    normalized = srt_content.replace("\r\n", "\n").replace("\r", "\n")
    normalized = normalized.lstrip("\ufeff").strip()

    captions = []
    for block in re.split(r"\n\s*\n", normalized):
        lines = [line.strip() for line in block.split("\n")]
        timing_idx = next(
            (idx for idx, line in enumerate(lines) if "-->" in line), None
        )
        if timing_idx is None:
            continue

        start_part, _, end_part = lines[timing_idx].partition("-->")
        start_time = start_part.strip()
        # WebVTT may append cue settings ("align:start ...") after the end time.
        end_fields = end_part.split()
        text = " ".join(line for line in lines[timing_idx + 1:] if line)
        if not start_time or not end_fields or not text:
            continue

        captions.append(
            {"start": start_time, "end": end_fields[0], "text": text}
        )
    return captions


def srt_to_seconds(time_str):
    """Convert a caption timestamp to seconds.

    Accepts ``HH:MM:SS,mmm`` (SRT), ``HH:MM:SS.mmm`` and ``MM:SS.mmm``
    (WebVTT). The fractional part is read as milliseconds.

    Args:
        time_str: Timestamp string.

    Returns:
        The offset in seconds.

    Raises:
        ValueError: If the timestamp does not match a supported layout.
    """
    clock, _, millis = time_str.strip().replace(",", ".").partition(".")
    fields = clock.split(":")
    if not 2 <= len(fields) <= 3:
        raise ValueError(f"Unsupported timestamp: {time_str!r}")

    # Left-pad with zero hours so "MM:SS" and "HH:MM:SS" share one code path.
    hours, minutes, seconds = [0] * (3 - len(fields)) + [int(f) for f in fields]
    fraction = int(millis) / 1000 if millis else 0.0
    return hours * 3600 + minutes * 60 + seconds + fraction


def cut_audio(audio_path, captions, output_dir="tts_dataset"):
    """Cut *audio_path* into one WAV clip per caption.

    Clips are named ``clip_<index>.wav`` where the index is the position of
    the caption in *captions*. Captions whose end time is not after their
    start time are skipped, because ffmpeg cannot produce a clip for them.

    Args:
        audio_path: Path of the source audio file.
        captions: Cues as returned by :func:`parse_srt`.
        output_dir: Directory that receives the clips; created if missing.

    Returns:
        A list of ``{"file": clip_path, "text": caption_text}`` dicts, one per
        clip written.
    """
    os.makedirs(output_dir, exist_ok=True)
    metadata = []
    for i, cap in enumerate(captions):
        start_sec = srt_to_seconds(cap['start'])
        end_sec = srt_to_seconds(cap['end'])
        if end_sec <= start_sec:
            continue

        output_path = os.path.join(output_dir, f"clip_{i}.wav")
        (
            ffmpeg
            .input(audio_path, ss=start_sec, to=end_sec)
            .output(output_path, format='wav')
            .run(quiet=True, overwrite_output=True)
        )
        metadata.append({"file": output_path, "text": cap['text']})
    return metadata


def zip_dataset(output_dir="tts_dataset", zip_name="tts_data.zip"):
    """Pack every file below *output_dir* into a ZIP stored in that directory.

    Args:
        output_dir: Directory whose files are archived.
        zip_name: File name of the archive, created inside *output_dir*.

    Returns:
        Path of the archive.
    """
    zip_path = os.path.join(output_dir, zip_name)
    zip_abspath = os.path.abspath(zip_path)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(output_dir):
            for file in sorted(files):
                full_path = os.path.join(root, file)
                # The archive lives inside the walked directory; never add the
                # partially written archive to itself.
                if os.path.abspath(full_path) == zip_abspath:
                    continue
                zipf.write(
                    full_path, arcname=os.path.relpath(full_path, output_dir)
                )
    return zip_path


def _uploaded_path(upload):
    """Return the filesystem path of a Gradio upload.

    Gradio hands the callback either a path string or a temp-file object
    exposing the path as ``.name``, depending on version and component type.
    """
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def _read_caption_text(upload):
    """Read an uploaded caption file as UTF-8 text, dropping a leading BOM."""
    with open(_uploaded_path(upload), "rb") as fh:
        return fh.read().decode("utf-8-sig")


def process_data(video_url, audio_file, caption_file):
    """Gradio callback: build the clip dataset and return the ZIP path.

    Args:
        video_url: Optional video URL; takes precedence over *audio_file*.
        audio_file: Optional uploaded audio file.
        caption_file: Uploaded caption file (SRT or WebVTT).

    Returns:
        Path of the ZIP archive holding the clips and ``metadata.json``.

    Raises:
        gr.Error: If no audio source or caption file was supplied, or the
            caption file yields no captions.
    """
    if caption_file is None:
        raise gr.Error("Please upload a caption file (SRT/VTT).")

    url = (video_url or "").strip()
    if url:
        audio_path = download_video(url)
    elif audio_file:
        audio_path = _uploaded_path(audio_file)
    else:
        # The output component is a file, so a plain string return value would
        # be treated as a file path; raise a user-visible error instead.
        raise gr.Error("No valid input provided.")

    captions = parse_srt(_read_caption_text(caption_file))
    if not captions:
        raise gr.Error("No captions could be parsed from the caption file.")

    # A fresh directory per request keeps clips from earlier runs out of the
    # archive.
    output_dir = tempfile.mkdtemp(prefix="tts_dataset_")
    metadata = cut_audio(audio_path, captions, output_dir=output_dir)

    metadata_path = os.path.join(output_dir, "metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)

    return zip_dataset(output_dir=output_dir)


demo = gr.Interface(
    fn=process_data,
    inputs=[
        gr.Textbox(label="YouTube Video URL (optional)"),
        gr.File(label="Upload Audio File (optional)"),
        gr.File(label="Upload Caption File (SRT/VTT)"),
    ],
    outputs=gr.File(label="Download Processed ZIP"),
    title="TTS Dataset Cutter",
    description=(
        "Upload a video/audio file + caption file (SRT/VTT) to extract speech "
        "clips for TTS training. Supports YouTube download."
    ),
)

if __name__ == "__main__":
    demo.launch()