File size: 9,477 Bytes
5021a0c
 
 
63f1d6d
4337b98
5021a0c
4bccf88
e42e13d
a4e47b1
0596274
44c7b6f
eae282d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284179e
 
 
eae282d
 
27bebc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bccf88
 
 
 
d3fce98
 
 
27bebc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5021a0c
552e1db
4bccf88
 
552e1db
 
5021a0c
552e1db
4bccf88
5021a0c
27bebc1
 
 
4bccf88
 
27bebc1
 
 
 
 
4bccf88
5021a0c
 
552e1db
 
27bebc1
5021a0c
 
92f530c
5021a0c
552e1db
5021a0c
 
 
27bebc1
5021a0c
 
 
 
 
 
 
 
552e1db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
from pathlib import Path
from pydub import AudioSegment

def get_silence(duration_ms=1000):
    """Write a stretch of silence to a temporary MP3 file and return its path.

    The clip is mono, 24 kHz, with a 4-byte sample width, and is encoded with
    libmp3lame at a 48k bitrate.  The temporary file is created with
    ``delete=False``, so it is not removed automatically; the caller owns it.

    Args:
        duration_ms: Length of the silence in milliseconds.

    Returns:
        Path of the generated ``.mp3`` file.
    """
    # Build the silent clip at 24 kHz, then force mono and 4-byte samples.
    clip = (
        AudioSegment.silent(duration=duration_ms, frame_rate=24000)
        .set_channels(1)
        .set_sample_width(4)
    )

    # Extra encoder flags mirroring the clip settings above.
    encoder_flags = [
        "-ac", "1",  # Mono
        "-ar", "24000",  # Sample rate
        "-sample_fmt", "s32",  # 32-bit samples
        "-codec:a", "libmp3lame",  # MP3 codec
    ]

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        out_path = handle.name
        clip.export(
            out_path,
            format="mp3",
            bitrate="48k",
            parameters=encoder_flags,
        )
        return out_path

# Get all available voices
async def get_voices():
    """Fetch the edge-tts voice list as a display-label -> short-name mapping.

    Returns:
        A dict whose keys have the form ``"<ShortName> - <Locale> (<Gender>)"``
        and whose values are the corresponding ``ShortName`` strings.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry['ShortName']
    return labelled

# Voice-command prefixes recognised at the very start of a segment.  Each entry
# maps the two-character prefix to
# (edge-tts voice short name, rate override, pitch override); ``None`` keeps
# the rate or pitch supplied by the caller.
_VOICE_COMMANDS = {
    "1F": ("en-GB-SoniaNeural", None, 25),
    "2F": ("en-US-JennyNeural", None, None),
    "3F": ("en-HK-YanNeural", None, None),
    "4F": ("en-US-EmmaNeural", None, None),
    "1M": ("en-AU-WilliamNeural", None, None),
    "2M": ("en-GB-RyanNeural", None, None),
    "3M": ("en-US-BrianMultilingualNeural", None, None),  # good for reading
    "4M": ("en-GB-ThomasNeural", None, None),
    "1O": ("en-GB-RyanNeural", -10, -20),  # old man voice
    "1C": ("en-GB-MaisieNeural", None, None),  # child voice
}


async def text_to_speech_segment(text_segment, voice, rate, pitch):
    """Synthesize one text segment to a temporary MP3 file.

    If the segment starts with one of the prefixes in ``_VOICE_COMMANDS``
    (for example ``1F`` or ``1C``), the prefix is removed, the voice is
    switched, and any rate/pitch override attached to that prefix is applied.
    Otherwise the caller-selected voice, rate and pitch are used as given.

    Args:
        text_segment: Text to speak, optionally starting with a voice prefix.
        voice: Selected voice label of the form ``"<ShortName> - ..."``; only
            the part before ``" - "`` is passed to edge-tts.
        rate: Speech-rate adjustment in percent (int or float).
        pitch: Pitch adjustment in Hz (int or float).

    Returns:
        Path of the generated ``.mp3`` file (the caller must delete it), or
        ``None`` when nothing is left to speak once the prefix is removed.

    Raises:
        Whatever ``edge_tts.Communicate`` / ``save`` raises; the temporary
        file is removed before the exception propagates.
    """
    voice_short = voice.split(" - ")[0] if voice else ""
    spoken_text = text_segment

    # All prefixes are exactly two characters, so a slice lookup is
    # equivalent to a chain of ``startswith`` checks.
    command = _VOICE_COMMANDS.get(text_segment[:2])
    if command is not None:
        voice_short, rate_override, pitch_override = command
        if rate_override is not None:
            rate = rate_override
        if pitch_override is not None:
            pitch = pitch_override
        spoken_text = text_segment[2:].strip()

    # Nothing to speak (e.g. the segment was only a prefix): skip the request
    # instead of sending empty text to the service and leaving a stray file.
    if not spoken_text.strip():
        return None

    # Sliders may deliver floats; the ``+d`` format only accepts integers.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(spoken_text, voice_short, rate=rate_str, pitch=pitch_str)

    # Reserve a unique path, then close the handle before edge-tts writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        audio_path = tmp_file.name
    try:
        await communicate.save(audio_path)
    except BaseException:
        # Do not leave a partial or empty file behind on failure/cancellation.
        try:
            os.remove(audio_path)
        except OSError:
            pass
        raise
    return audio_path

# Leading "minutes:seconds[.fraction]" marker followed by the text to speak.
# DOTALL keeps text that continues on following lines instead of cutting the
# segment off at its first newline.
_TIMESTAMP_RE = re.compile(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', re.DOTALL)

# Straight and curly double quotes delimit segments.
_QUOTE_SPLIT_RE = re.compile(r'[“”"]')


def _split_timestamp(segment):
    """Return ``(start_ms, text)``; ``start_ms`` is ``None`` without a timestamp."""
    match = _TIMESTAMP_RE.match(segment)
    if match is None:
        return None, segment
    minutes, seconds, fraction, text = match.groups()
    start_ms = int(minutes) * 60000 + int(seconds) * 1000
    if fraction:
        # Read the digits as a decimal fraction of a second with millisecond
        # precision: ".5" -> 500 ms, ".25" -> 250 ms, ".250" -> 250 ms.
        start_ms += int(fraction.ljust(3, "0")[:3])
    return start_ms, text


def _remove_quietly(path):
    """Delete *path*, ignoring the case where it is already gone or locked."""
    try:
        os.remove(path)
    except OSError:
        pass


def _export_to_temp_mp3(audio):
    """Export *audio* to a freshly reserved temporary ``.mp3`` and return its path."""
    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # tempfile.mktemp which only returns a name that may be taken by then.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        out_path = tmp_file.name
    audio.export(out_path, format="mp3")
    return out_path


async def transcript_to_speech(transcript_text, voice, rate, pitch):
    """Turn a quote-delimited transcript into a single MP3 file.

    The transcript is split on double quotes (straight or curly).  Each
    non-empty segment is synthesized with ``text_to_speech_segment``.  Segments
    that begin with ``minutes:seconds[.fraction]`` are overlaid on a silent
    track at that offset; segments without a timestamp are appended, in
    order, after the timed track (or simply concatenated when no segment has
    a timestamp).

    Args:
        transcript_text: Raw transcript entered by the user.
        voice: Selected voice label, forwarded to ``text_to_speech_segment``.
        rate: Speech-rate adjustment, forwarded unchanged.
        pitch: Pitch adjustment, forwarded unchanged.

    Returns:
        ``(path, message)``: ``path`` is the combined ``.mp3`` file or ``None``;
        ``message`` is ``None`` on success, otherwise a warning/notice value.
    """
    if not transcript_text or not transcript_text.strip():
        return None, gr.Warning("Please enter transcript text.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    generated = []  # (start_ms or None, path of the per-segment mp3)
    try:
        for raw_segment in _QUOTE_SPLIT_RE.split(transcript_text):
            segment = raw_segment.strip()
            if not segment:
                continue
            start_ms, text = _split_timestamp(segment)
            audio_path = await text_to_speech_segment(text, voice, rate, pitch)
            if audio_path:
                generated.append((start_ms, audio_path))

        if not generated:
            return None, "No audio segments generated."

        timed = [
            (start_ms, AudioSegment.from_mp3(path))
            for start_ms, path in generated
            if start_ms is not None
        ]
        untimed = [
            AudioSegment.from_mp3(path)
            for start_ms, path in generated
            if start_ms is None
        ]

        if timed:
            # The silent base track must be long enough for the latest-ending clip.
            total_ms = max(start_ms + len(clip) for start_ms, clip in timed)
            final_audio = AudioSegment.silent(duration=total_ms, frame_rate=24000)
            for start_ms, clip in timed:
                final_audio = final_audio.overlay(clip, position=start_ms)
        else:
            final_audio = AudioSegment.empty()

        # Segments without a timestamp follow sequentially.
        for clip in untimed:
            final_audio += clip

        return _export_to_temp_mp3(final_audio), None
    finally:
        # Per-segment files are intermediate; remove them on success and on
        # failure alike so an error part-way through does not leak them.
        for _, path in generated:
            _remove_quietly(path)

@spaces.GPU
def tts_interface(transcript, voice, rate, pitch):
    """Synchronous entry point that drives the async ``transcript_to_speech``.

    Runs the coroutine to completion with ``asyncio.run`` and hands back its
    ``(audio, warning)`` pair unchanged.
    """
    synthesis = transcript_to_speech(transcript, voice, rate, pitch)
    audio_path, message = asyncio.run(synthesis)
    return audio_path, message

async def create_demo():
    """Build the Gradio interface for the combined TTS tool.

    Fetches the available voices with ``get_voices`` and wires
    ``tts_interface`` to a transcript textbox, a voice dropdown and rate/pitch
    sliders, with an audio player and a hidden Markdown component as outputs.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    if default_voice not in voices:
        # The preferred voice is not in the fetched list; fall back to the
        # blank choice rather than preselecting a value that is not offered.
        default_voice = ""
    description = """
    Process text, handling both timestamped transcripts and voice switching using quote marks and prefixes.
    Separate segments by quote marks ("). For timestamped segments, use the format: `minutes:seconds[.milliseconds] text`.
    Voice prefixes (e.g., 1F, 1C) can be used at the beginning of a quoted segment to switch voices.
    Example:
    ```
    0:00 "This"
    "0:14 is the story of little Red Riding Hood"
    "0:38 1F Grandma isn’t feeling very well."
    "0:48 1C Yes, said Little Red Riding Hood."
    "and then the default voice continues"
    ```
    """
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text / Transcript", lines=10, placeholder='0:00 "This"\n"0:14 is the story..."\n"1F Hello"'),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Combined TTS: Timestamps and Voice Switching",
        description=description,
        analytics_enabled=False,
        # The string form is the documented way to disable flagging; the
        # boolean form is deprecated.
        allow_flagging="never",
    )
    return demo

if __name__ == "__main__":
    # Script entry point: create_demo is a coroutine, so run it to completion
    # with asyncio.run to obtain the interface, then launch it.
    demo = asyncio.run(create_demo())
    demo.launch()