File size: 7,527 Bytes
5021a0c
 
 
63f1d6d
4337b98
5021a0c
552e1db
e42e13d
0596274
fb91b69
 
284179e
 
 
 
 
7042e46
552e1db
5021a0c
552e1db
 
 
 
 
 
 
 
2f93aef
63f1d6d
2f93aef
 
 
284179e
2f93aef
 
 
fb91b69
 
 
 
 
 
 
 
 
 
2f93aef
 
 
 
 
 
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
ef4c8b8
284179e
 
552e1db
 
2f93aef
 
 
284179e
2f93aef
 
 
 
552e1db
2f93aef
284179e
2f93aef
 
d3fce98
 
 
 
 
 
552e1db
2f93aef
4337b98
5021a0c
284179e
2f93aef
284179e
 
 
 
2f93aef
 
 
 
 
 
 
284179e
 
 
 
2f93aef
284179e
 
 
2f93aef
284179e
2f93aef
5021a0c
552e1db
 
 
 
 
 
 
284179e
 
5021a0c
552e1db
 
5021a0c
92f530c
5eeb00f
5021a0c
 
 
 
 
552e1db
 
2f93aef
5021a0c
 
92f530c
5021a0c
552e1db
5021a0c
 
 
284179e
5021a0c
2f93aef
5021a0c
 
 
 
 
 
 
 
552e1db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re  # Import the regular expression module
from pathlib import Path

# Location of the silence clip inserted for 'SS' markers; it is looked up
# in the same directory as this source file.
SILENCE_PATH = Path(__file__).parent.absolute() / "Silence.mp3"

# Get all available voices
async def get_voices():
    """Return the edge-tts voices as a mapping of display label to short name.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and maps
    to the voice's ``ShortName``.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled

# Text-to-speech function for a single paragraph with SS handling

# Voice used when the caller supplies none; same label the UI preselects.
_FALLBACK_VOICE = "en-US-AndrewMultilingualNeural - en-US (Male)"

# Two-character tags that may start a text part, mapped to
# (voice short name, rate override, pitch override).
# An override of None keeps the rate/pitch passed by the caller.
_VOICE_TAGS = {
    "1F": ("en-US-EmmaNeural", None, None),
    "2F": ("en-US-JennyNeural", None, None),
    "3F": ("en-HK-YanNeural", None, None),
    "1M": ("en-AU-WilliamNeural", None, None),
    "2M": ("it-IT-GiuseppeMultilingualNeural", None, None),
    "3M": ("en-US-BrianMultilingualNeural", None, None),  # good for reading
    "1C": ("en-GB-MaisieNeural", None, None),  # child
    "1O": ("en-GB-RyanNeural", -20, -30),  # old man: slower and lower
}

# The capture group makes re.split keep the markers in its result.
_SILENCE_MARKER = re.compile(r'(SS\d+\.?\d*)')


async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesize one paragraph into a sequence of temporary MP3 files.

    The paragraph is split on silence markers (``SS`` followed by a number).
    Each marker contributes one copy of the clip at ``SILENCE_PATH``; the
    number after ``SS`` is not used to scale the silence. Every other
    non-blank part is spoken with edge-tts. A part may begin with a
    two-character voice tag (see ``_VOICE_TAGS``) that selects a specific
    voice; otherwise ``voice`` is used.

    Args:
        text: Paragraph text, possibly containing markers and voice tags.
        voice: Voice label of the form ``"<ShortName> - ..."``; a built-in
            fallback is used when it is empty.
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        ``(None, [])`` when ``text`` is empty or blank. Otherwise a tuple
        ``(paths, silence_durations)`` where ``paths`` lists the generated
        MP3 files in playback order and ``silence_durations`` is always an
        empty list (kept for interface compatibility). Every path is a
        temporary file the caller is free to delete.
    """
    if not text or not text.strip():
        return None, []  # Nothing to synthesize

    audio_segments = []
    silence_durations = []

    for part in _SILENCE_MARKER.split(text):
        if _SILENCE_MARKER.fullmatch(part):
            try:
                silence_bytes = SILENCE_PATH.read_bytes()
            except OSError:
                # Skip the pause rather than abort the whole request.
                print(f"Warning: could not read silence file {SILENCE_PATH}; skipping silence")
                continue
            # Hand out a private copy: the caller removes segment files after
            # merging them, which must not delete the shared silence clip.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_file.write(silence_bytes)
            audio_segments.append(tmp_file.name)
            print(f"Silence added at {SILENCE_PATH}")
            continue

        # Strip first so a tag that follows a marker ("SS1 2M Hi") is detected.
        spoken = part.strip()
        if not spoken:
            continue

        current_rate = rate
        current_pitch = pitch
        tag = spoken[:2]
        if tag in _VOICE_TAGS:
            current_voice, rate_override, pitch_override = _VOICE_TAGS[tag]
            if rate_override is not None:
                current_rate = rate_override
            if pitch_override is not None:
                current_pitch = pitch_override
            spoken = spoken[2:].strip()
            if not spoken:
                continue  # A tag with no text has nothing to speak
        else:
            # Use selected voice, or fall back to the default
            current_voice = (voice or _FALLBACK_VOICE).split(" - ")[0]

        # int() guards against float slider values, which the 'd' format rejects.
        rate_str = f"{int(current_rate):+d}%"
        pitch_str = f"{int(current_pitch):+d}Hz"
        communicate = edge_tts.Communicate(spoken, current_voice, rate=rate_str, pitch=pitch_str)
        # Reserve a unique file name, then close the handle before saving.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            tmp_path = tmp_file.name
        await communicate.save(tmp_path)
        audio_segments.append(tmp_path)

    return audio_segments, silence_durations

# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    """Convert the full input text into a single MP3 file.

    The text is split on double-quote characters; each non-blank piece is
    synthesized by ``paragraph_to_speech`` and the resulting MP3 files are
    concatenated byte-for-byte into one output file.

    Args:
        text: Text entered by the user.
        voice: Selected voice label.
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        A ``(audio_path, message)`` tuple. ``audio_path`` is the combined
        MP3 file, or ``None`` when there was nothing to synthesize. For
        missing text or voice, ``message`` is whatever ``gr.Warning``
        returns; otherwise it is ``None``.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # NOTE(review): pieces are delimited by '"' characters, although the UI
    # text talks about blank lines - confirm which one is intended.
    paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]

    segment_paths = []
    for paragraph in paragraphs:
        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
        # Only real file paths can be merged; drop any placeholders.
        segment_paths.extend(
            path for path in (audio_paths or []) if isinstance(path, str)
        )

    if not segment_paths:
        return None, None  # No actual audio generated

    # The silence clip is a permanent asset and must survive the cleanup below.
    shared_silence = str(SILENCE_PATH)

    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # tempfile.mktemp, which only returns a name that may be taken meanwhile.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
        combined_audio_path = outfile.name
        for segment in segment_paths:
            try:
                with open(segment, 'rb') as infile:
                    outfile.write(infile.read())
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {segment}")
                continue
            if segment != shared_silence:
                os.remove(segment)  # Clean up individual files
    return combined_audio_path, None

# Gradio interface function
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Synchronous entry point for Gradio: run ``text_to_speech`` to completion.

    Returns the same ``(audio, warning)`` pair that ``text_to_speech`` yields.
    """
    synthesis = text_to_speech(text, voice, rate, pitch)
    result_path, notice = asyncio.run(synthesis)
    return result_path, notice

# Create Gradio application
import gradio as gr

async def create_demo():
    """Build the Gradio interface for the text-to-speech application.

    Fetches the available voices, then assembles a ``gr.Interface`` with a
    text box, a voice dropdown, and rate and pitch sliders as inputs, and an
    audio player plus a hidden Markdown component as outputs.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # Pick one of the available voices
    # Preselect the default only if the service still offers it; otherwise
    # start on the blank entry so the dropdown value is always a valid choice.
    initial_voice = default_voice if default_voice in voices else ""
    description = """
    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
    You can insert silence using the marker 'SS' (This will insert a Silence period from the Silence.mp3 file).
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application will process your text paragraph by paragraph (separated by two blank lines).
    """

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=initial_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        # Flagging modes are the strings "never" / "auto" / "manual"; a bare
        # boolean is not one of them.
        allow_flagging="never",
    )
    return demo

# Run the application when executed as a script: build the interface
# (an async step, hence asyncio.run) and then start the Gradio server.
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()