import gradio as gr
import yt_dlp
import os
import zipfile
import ffmpeg
import json

def download_video(url, output_dir="downloads"):
    # Download the best available audio stream and let yt-dlp's FFmpeg
    # post-processor convert it to WAV.
    os.makedirs(output_dir, exist_ok=True)
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav', 'preferredquality': '192'}]
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        # The post-processor swaps the original extension for .wav; replace only
        # the extension rather than every occurrence of the string in the path.
        return os.path.splitext(ydl.prepare_filename(info_dict))[0] + '.wav'

def parse_srt(srt_content):
    # Parse caption blocks ("index / timestamp range / text lines") into dicts.
    captions = []
    for block in srt_content.strip().split("\n\n"):
        parts = block.split("\n")
        if len(parts) >= 3:
            time_range = parts[1]
            text = " ".join(parts[2:])
            start_time, end_time = time_range.split(" --> ")
            captions.append({"start": start_time, "end": end_time, "text": text})
    return captions

def srt_to_seconds(time_str):
    # Convert an "HH:MM:SS,mmm" (SRT) or "HH:MM:SS.mmm" (VTT) timestamp to seconds.
    h, m, s = time_str.split(':')
    s, ms = s.replace('.', ',').split(',')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

def cut_audio(audio_path, captions, output_dir="tts_dataset"):
    # Slice the source audio into one WAV clip per caption and record the
    # clip-to-transcript mapping for the metadata file.
    os.makedirs(output_dir, exist_ok=True)
    metadata = []
    for i, cap in enumerate(captions):
        start_sec = srt_to_seconds(cap['start'])
        end_sec = srt_to_seconds(cap['end'])
        output_path = os.path.join(output_dir, f"clip_{i}.wav")
        (
            ffmpeg
            .input(audio_path, ss=start_sec, to=end_sec)
            .output(output_path, format='wav')
            .run(quiet=True, overwrite_output=True)
        )
        metadata.append({"file": output_path, "text": cap['text']})
    return metadata
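
# For reference, each metadata entry produced above (and later written to
# tts_dataset/metadata.json) looks roughly like this, with illustrative values:
#     {"file": "tts_dataset/clip_0.wav", "text": "caption text for this clip"}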

def zip_dataset(output_dir="tts_dataset", zip_name="tts_data.zip"):
    # Bundle every clip plus metadata.json into a single ZIP for download.
    zip_path = os.path.join(output_dir, zip_name)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(output_dir):
            for file in files:
                if file == zip_name:
                    continue  # Skip the archive itself while it is being written.
                zipf.write(os.path.join(root, file), arcname=file)
    return zip_path

def process_data(video_url, audio_file, caption_file):
    # Resolve the audio source: a YouTube URL takes priority over an uploaded file.
    if video_url:
        audio_path = download_video(video_url)
    elif audio_file:
        audio_path = audio_file.name
    else:
        raise gr.Error("No valid input provided.")
    # Read the caption file by path; the uploaded temp file may already be closed.
    with open(caption_file.name, "r", encoding="utf-8") as f:
        captions = parse_srt(f.read())
    metadata = cut_audio(audio_path, captions)
    with open("tts_dataset/metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)
    zip_path = zip_dataset()
    return zip_path

demo = gr.Interface(
    fn=process_data,
    inputs=[
        gr.Textbox(label="YouTube Video URL (optional)"),
        gr.File(label="Upload Audio File (optional)"),
        gr.File(label="Upload Caption File (SRT/VTT)")
    ],
    outputs=gr.File(label="Download Processed ZIP"),
    title="TTS Dataset Cutter",
    description="Upload a video/audio file + caption file (SRT/VTT) to extract speech clips for TTS training. Supports YouTube download."
)

demo.launch()