File size: 3,143 Bytes
47dccb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import yt_dlp
import os
import zipfile
import ffmpeg
import json
from datetime import timedelta

def download_video(url, output_dir="downloads"):
    """Download the best audio stream of *url* and return the path of the WAV file.

    The download is delegated to yt-dlp, configured with an
    ``FFmpegExtractAudio`` post-processor whose preferred codec is ``wav``.

    Args:
        url: Address of the video to download.
        output_dir: Directory that receives the download; created if missing.

    Returns:
        The file name yt-dlp prepared for the download, with its extension
        swapped for ``.wav``.
    """
    os.makedirs(output_dir, exist_ok=True)
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav', 'preferredquality': '192'}]
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        downloaded_path = ydl.prepare_filename(info_dict)

    # Swap only the trailing extension. A plain str.replace() on the extension
    # text would also rewrite it wherever it appears in the title or directory
    # (e.g. "my webm clip.webm" -> "my wav clip.wav").
    stem, _ = os.path.splitext(downloaded_path)
    return f"{stem}.wav"

def parse_srt(srt_content):
    """Parse SRT-style caption text into a list of caption dictionaries.

    Caption blocks are separated by one or more blank (or whitespace-only)
    lines. Within a block the first line is the cue index, the second line is
    the timing line (``start --> end``) and all remaining lines are the caption
    text, which is joined with single spaces.

    Args:
        srt_content: Full caption file content as a string. ``\\n``, ``\\r\\n``
            and ``\\r`` line endings are all accepted, as is a leading BOM.

    Returns:
        A list of ``{"start": str, "end": str, "text": str}`` dictionaries in
        file order. Blocks with fewer than three lines, or whose second line
        contains no ``-->`` marker, are not cues and are skipped.
    """
    # Normalise line endings so files written on Windows split correctly.
    normalized = srt_content.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")

    # Group consecutive non-blank lines into blocks; any run of blank or
    # whitespace-only lines acts as a single separator.
    blocks = []
    current = []
    for raw_line in normalized.split("\n"):
        line = raw_line.strip()
        if line:
            current.append(line)
        elif current:
            blocks.append(current)
            current = []
    if current:
        blocks.append(current)

    captions = []
    for lines in blocks:
        if len(lines) < 3:
            continue
        # partition() tolerates any amount of whitespace around the arrow.
        start_time, arrow, end_time = lines[1].partition("-->")
        if not arrow:
            continue
        captions.append({
            "start": start_time.strip(),
            "end": end_time.strip(),
            "text": " ".join(lines[2:]),
        })
    return captions

def srt_to_seconds(time_str):
    """Convert a caption timestamp to a number of seconds.

    Accepts the SRT form ``HH:MM:SS,mmm`` as well as the WebVTT forms
    ``HH:MM:SS.mmm`` and ``MM:SS.mmm``. Surrounding whitespace is ignored and
    the fractional part is optional.

    Args:
        time_str: Timestamp string.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If the timestamp does not have two or three
            colon-separated fields or contains non-numeric parts.
    """
    # Treat ',' (SRT) and '.' (VTT) as the same decimal separator.
    clock, _, fraction = time_str.strip().replace(",", ".").partition(".")
    fields = clock.split(":")
    if not 2 <= len(fields) <= 3:
        raise ValueError(f"Unrecognised timestamp: {time_str!r}")

    whole_seconds = 0
    for field in fields:
        whole_seconds = whole_seconds * 60 + int(field)

    # Scale by the number of digits so "5" means 0.5 s and "500" also 0.5 s.
    fractional = int(fraction) / 10 ** len(fraction) if fraction else 0.0
    return whole_seconds + fractional

def cut_audio(audio_path, captions, output_dir="tts_dataset"):
    """Cut one WAV clip out of *audio_path* for every caption.

    Args:
        audio_path: Path of the source audio file handed to ffmpeg.
        captions: Iterable of dictionaries with ``start``, ``end`` and ``text``
            keys; ``start``/``end`` are timestamps understood by
            ``srt_to_seconds``.
        output_dir: Directory that receives the clips; created if missing.

    Returns:
        A list of ``{"file": clip_path, "text": caption_text}`` dictionaries,
        one per caption, in caption order.
    """
    os.makedirs(output_dir, exist_ok=True)

    entries = []
    for clip_no, caption in enumerate(captions):
        begin = srt_to_seconds(caption['start'])
        finish = srt_to_seconds(caption['end'])
        clip_path = os.path.join(output_dir, f"clip_{clip_no}.wav")

        # Build the ffmpeg job step by step, then run it silently, replacing
        # any clip that already exists under the same name.
        source = ffmpeg.input(audio_path, ss=begin, to=finish)
        job = source.output(clip_path, format='wav')
        job.run(quiet=True, overwrite_output=True)

        entries.append({"file": clip_path, "text": caption['text']})

    return entries

def zip_dataset(output_dir="tts_dataset", zip_name="tts_data.zip"):
    """Pack every file under *output_dir* into a ZIP archive stored in that directory.

    Args:
        output_dir: Directory whose files are archived (walked recursively).
        zip_name: File name of the archive, created inside *output_dir*.

    Returns:
        The path of the archive, ``os.path.join(output_dir, zip_name)``.
    """
    zip_path = os.path.join(output_dir, zip_name)
    zip_abs_path = os.path.abspath(zip_path)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(output_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive lives inside the directory being walked; never
                # add the (still open, half-written) archive to itself.
                if os.path.abspath(file_path) == zip_abs_path:
                    continue
                # Store paths relative to output_dir so same-named files in
                # different subdirectories do not collide; for files directly
                # in output_dir this is just the file name.
                zipf.write(file_path, arcname=os.path.relpath(file_path, output_dir))
    return zip_path

def process_data(video_url, audio_file, caption_file):
    """Gradio callback: build a zipped TTS dataset from audio plus captions.

    The audio comes from *video_url* (downloaded via ``download_video``) when
    one is given, otherwise from the uploaded *audio_file*. The captions are
    parsed, one clip is cut per caption, the clip metadata is written to
    ``tts_dataset/metadata.json`` and the dataset directory is zipped.

    Args:
        video_url: Video URL, or an empty value to use *audio_file* instead.
        audio_file: Uploaded audio file, or ``None``.
        caption_file: Uploaded caption file.

    Returns:
        The path of the ZIP archive produced by ``zip_dataset``.

    Raises:
        gr.Error: If no caption file, or neither a URL nor an audio file, was
            provided. Raising (rather than returning a message string) lets
            Gradio show the message; the output component is a file, so a
            returned message would be treated as a file path.
    """
    # Validate before doing any expensive work such as downloading.
    if caption_file is None:
        raise gr.Error("No caption file provided.")

    video_url = (video_url or "").strip()
    if video_url:
        audio_path = download_video(video_url)
    elif audio_file:
        # Uploaded files expose their location as ``.name``; fall back to the
        # value itself in case it is already a plain path string.
        audio_path = getattr(audio_file, "name", audio_file)
    else:
        raise gr.Error("No valid input provided.")

    # Read the captions from disk instead of calling ``.read()`` on the upload
    # object, which is not available on every upload type Gradio may pass in.
    # ``utf-8-sig`` also drops a leading byte-order mark if present.
    caption_path = getattr(caption_file, "name", caption_file)
    with open(caption_path, encoding="utf-8-sig") as f:
        captions = parse_srt(f.read())

    metadata = cut_audio(audio_path, captions)
    # ensure_ascii=False emits raw non-ASCII text, so the file must be opened
    # as UTF-8 explicitly rather than with the platform default encoding.
    with open("tts_dataset/metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)

    zip_path = zip_dataset()
    return zip_path

# Gradio front end: three inputs (optional URL, optional audio upload, caption
# upload) feed process_data, whose result is offered as a downloadable file.
demo = gr.Interface(
    title="TTS Dataset Cutter",
    description="Upload a video/audio file + caption file (SRT/VTT) to extract speech clips for TTS training. Supports YouTube download.",
    fn=process_data,
    inputs=[
        gr.Textbox(label="YouTube Video URL (optional)"),
        gr.File(label="Upload Audio File (optional)"),
        gr.File(label="Upload Caption File (SRT/VTT)"),
    ],
    outputs=gr.File(label="Download Processed ZIP"),
)

# Start the web server for the interface.
demo.launch()