Spaces:
Sleeping
Sleeping
File size: 7,753 Bytes
5021a0c 63f1d6d 4337b98 5021a0c 4bccf88 e42e13d a4e47b1 0596274 44c7b6f 4bccf88 44c7b6f 4bccf88 44c7b6f 284179e 7042e46 4bccf88 d3fce98 4bccf88 4337b98 4bccf88 2f93aef 4bccf88 2f93aef 4bccf88 2f93aef 284179e 4bccf88 2f93aef 5021a0c 552e1db 4bccf88 552e1db 5021a0c 552e1db 4bccf88 5021a0c 4bccf88 5021a0c 552e1db 4bccf88 5021a0c 92f530c 5021a0c 552e1db 5021a0c 4bccf88 5021a0c 552e1db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
from pathlib import Path
from pydub import AudioSegment
def get_silence(duration_ms=1000):
    """Render `duration_ms` of silence to a temp MP3 file and return its path.

    The segment is forced to mono, 24 kHz, 32-bit samples to match the
    audio produced elsewhere in this app, then exported at 48 kbps via
    libmp3lame. The caller owns (and must delete) the returned file.
    """
    # Build the silent segment at the app's working sample rate.
    seg = (
        AudioSegment.silent(duration=duration_ms, frame_rate=24000)
        .set_channels(1)          # mono
        .set_sample_width(4)      # 4 bytes/sample = 32-bit
    )

    # ffmpeg flags mirroring the segment parameters above.
    export_args = [
        "-ac", "1",               # mono
        "-ar", "24000",           # sample rate
        "-sample_fmt", "s32",     # 32-bit samples
        "-codec:a", "libmp3lame", # MP3 codec
    ]

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        seg.export(
            tmp_file.name,
            format="mp3",
            bitrate="48k",
            parameters=export_args,
        )
        return tmp_file.name
# Get all available voices
async def get_voices():
    """Return a mapping of human-readable voice label -> edge-tts ShortName."""
    catalog = await edge_tts.list_voices()
    labels = {}
    for v in catalog:
        label = f"{v['ShortName']} - {v['Locale']} ({v['Gender']})"
        labels[label] = v['ShortName']
    return labels
async def process_transcript_line(line, voice, rate, pitch):
    """Process one transcript line: parse its timestamp and optional voice
    prefix, synthesize the text with edge-tts, and return the result.

    Expected line format: ``minutes:seconds[.fraction] [PREFIX ]text``.
    A two-character prefix (e.g. ``1F``, ``2M``, ``1O``, ``1C``) switches
    to a preset voice, optionally overriding pitch/rate.

    Args:
        line:  One transcript line.
        voice: Default voice — either a bare ShortName or a dropdown label
               of the form ``"ShortName - Locale (Gender)"``.
        rate:  Default speech-rate adjustment in percent (int).
        pitch: Default pitch adjustment in Hz (int).

    Returns:
        (start_time_ms, audio_path): path is None for empty text;
        (None, None) when the line has no parsable timestamp.
    """
    match = re.match(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', line)
    if not match:
        return None, None

    minutes, seconds, millis_str, text_with_commands = match.groups()
    start_time_ms = int(minutes) * 60000 + int(seconds) * 1000
    if millis_str:
        # Treat the fraction as a decimal fraction of a second regardless of
        # digit count: ".5" -> 500 ms, ".25" -> 250 ms, ".125" -> 125 ms.
        # (The previous "* 10" was only correct for exactly two digits.)
        start_time_ms += int(millis_str.ljust(3, "0")[:3])

    if not text_with_commands.strip():
        return start_time_ms, None

    # Prefix -> (voice label, pitch override or None, rate override or None).
    voice_presets = {
        "1M": ("en-AU-WilliamNeural - en-AU (Male)", None, None),
        "1F": ("en-GB-SoniaNeural - en-GB (Female)", 25, None),
        "2M": ("en-GB-RyanNeural - en-GB (Male)", None, None),
        "2F": ("en-US-JennyNeural - en-US (Female)", None, None),
        "3M": ("en-US-BrianMultilingualNeural - en-US (Male)", None, None),  # good for reading
        "3F": ("en-HK-YanNeural - en-HK (Female)", None, None),
        "4M": ("en-GB-ThomasNeural - en-GB (Male)", None, None),
        "4F": ("en-US-EmmaNeural - en-US (Female)", None, None),
        "1O": ("en-GB-RyanNeural - en-GB (Male)", -20, -10),  # Old man voice
        "1C": ("en-GB-MaisieNeural - en-GB (Female)", None, None),  # Child voice
    }

    # Fix: the default voice may be a dropdown label ("ShortName - Locale
    # (Gender)"); strip it down to the ShortName like the prefix branches do.
    # A bare ShortName contains no " - " and passes through unchanged.
    current_voice = voice.split(" - ")[0] if voice else voice
    current_rate = rate
    current_pitch = pitch
    processed_text = text_with_commands

    preset = voice_presets.get(text_with_commands[:2])
    if preset is not None:
        label, pitch_override, rate_override = preset
        current_voice = label.split(" - ")[0]
        if pitch_override is not None:
            current_pitch = pitch_override
        if rate_override is not None:
            current_rate = rate_override
        processed_text = text_with_commands[2:].strip()

    rate_str = f"{current_rate:+d}%"
    pitch_str = f"{current_pitch:+d}Hz"
    communicate = edge_tts.Communicate(processed_text, current_voice,
                                       rate=rate_str, pitch=pitch_str)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        audio_path = tmp_file.name
    await communicate.save(audio_path)
    return start_time_ms, audio_path
async def transcript_to_speech(transcript_text, voice, rate, pitch):
    """Convert a whole timestamped transcript into one synchronized MP3.

    Each line is synthesized independently, then overlaid onto a silent
    track at its timestamp so the output lines up with the original video.

    Args:
        transcript_text: Multi-line transcript ("m:ss[.fff] text" per line).
        voice:           Default voice label from the dropdown.
        rate:            Speech-rate adjustment in percent.
        pitch:           Pitch adjustment in Hz.

    Returns:
        (audio_path, warning): path to the combined MP3 and None on success;
        (None, warning/message) when input is missing or no line parses.
    """
    if not transcript_text.strip():
        return None, gr.Warning("Please enter transcript text.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    lines = transcript_text.strip().split('\n')
    audio_segments_with_time = []
    max_end_time_ms = 0
    for line in lines:
        start_time, audio_path = await process_transcript_line(line, voice, rate, pitch)
        if start_time is not None and audio_path:
            audio = AudioSegment.from_mp3(audio_path)
            audio_segments_with_time.append({'start': start_time, 'audio': audio, 'path': audio_path})
            max_end_time_ms = max(max_end_time_ms, start_time + len(audio))
        # Note: process_transcript_line never returns a path without a
        # timestamp, so no orphaned temp files exist to clean up here.

    if not audio_segments_with_time:
        return None, "No valid transcript lines found."

    # Lay every clip onto a silent track long enough to hold the last one.
    final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
    for segment in audio_segments_with_time:
        final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
        os.remove(segment['path'])  # Clean up individual audio files

    # Fix: tempfile.mktemp is deprecated and race-prone — create the file
    # atomically instead and hand its name to pydub's exporter.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        combined_audio_path = tmp_file.name
    final_audio.export(combined_audio_path, format="mp3")
    return combined_audio_path, None
@spaces.GPU
def tts_interface(transcript, voice, rate, pitch):
    """Synchronous Gradio entry point: drive the async TTS pipeline to
    completion and pass its (audio_path, warning) pair straight through."""
    return asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
async def create_demo():
    """Assemble the Gradio Interface: fetch the live voice catalog, then
    wire the transcript textbox, voice dropdown, and rate/pitch sliders
    to the TTS entry point. Returns the (unlaunched) Interface."""
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"

    description = """
Process YouTube transcript text with timestamps to generate synchronized audio.
Each line should be in the format: `minutes:seconds[.milliseconds] text`.
Voice prefixes (e.g., 1F, 1C) can be used at the beginning of a line to switch voices.
Example:
```
0:00 This
0:14 is the story of little Red Riding Hood
0:38 1F Grandma isn’t feeling very well.
0:48 1C Yes, said Little Red Riding Hood.
```
"""

    # Inputs: transcript text plus the default voice/rate/pitch controls.
    transcript_box = gr.Textbox(
        label="YouTube Transcript",
        lines=10,
        placeholder="0:00 This\n0:14 is the story...\n0:38 1F Grandma...",
    )
    voice_picker = gr.Dropdown(
        choices=[""] + list(voices.keys()),
        label="Select Voice",
        value=default_voice,
    )
    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0,
                            label="Speech Rate Adjustment (%)", step=1)
    pitch_slider = gr.Slider(minimum=-50, maximum=50, value=0,
                             label="Pitch Adjustment (Hz)", step=1)

    # Outputs: the synthesized file and a (normally hidden) warning slot.
    audio_out = gr.Audio(label="Generated Audio", type="filepath")
    warning_out = gr.Markdown(label="Warning", visible=False)

    return gr.Interface(
        fn=tts_interface,
        inputs=[transcript_box, voice_picker, rate_slider, pitch_slider],
        outputs=[audio_out, warning_out],
        title="TTS for YouTube Transcripts with Voice Switching",
        description=description,
        analytics_enabled=False,
        allow_flagging=False,
    )
if __name__ == "__main__":
    # Build the interface (voice catalog is fetched asynchronously),
    # then serve it with Gradio's blocking launcher.
    demo = asyncio.run(create_demo())
    demo.launch()