File size: 9,574 Bytes
5021a0c
 
 
63f1d6d
4337b98
5021a0c
552e1db
e42e13d
a4e47b1
0596274
44c7b6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284179e
 
 
 
 
7042e46
552e1db
5021a0c
552e1db
3e534e0
 
f1b4f1f
aa5ea31
552e1db
 
 
 
 
2f93aef
63f1d6d
2f93aef
 
 
284179e
2f93aef
 
aa5ea31
 
 
 
a86bdd5
aa5ea31
 
e4e3d3e
 
9733186
7a3f365
e4e3d3e
9733186
7a3f365
 
 
 
fdb31c0
7a3f365
 
 
2f93aef
 
 
 
 
 
 
552e1db
3e534e0
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
 
552e1db
2f93aef
3e534e0
ef4c8b8
284179e
 
552e1db
 
2f93aef
aa5ea31
 
 
2f93aef
 
284179e
2f93aef
 
 
 
552e1db
2f93aef
284179e
2f93aef
 
d3fce98
 
 
 
 
 
f067030
 
 
2f93aef
4337b98
5021a0c
284179e
2f93aef
284179e
 
 
 
2f93aef
 
 
 
 
 
 
284179e
 
 
 
2f93aef
284179e
 
 
2f93aef
284179e
2f93aef
5021a0c
552e1db
 
 
 
 
 
 
284179e
 
5021a0c
552e1db
 
5021a0c
92f530c
5eeb00f
aa5ea31
5021a0c
 
 
 
552e1db
 
2f93aef
5021a0c
 
92f530c
5021a0c
552e1db
5021a0c
 
 
284179e
5021a0c
2f93aef
5021a0c
 
 
 
 
 
 
 
552e1db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re  # Import the regular expression module
from pathlib import Path
from pydub import AudioSegment

def get_silence(duration_ms=1000):
    """Generate a silent MP3 clip of the given length.

    Args:
        duration_ms: Length of the silence in milliseconds.

    Returns:
        Path to a temporary ``.mp3`` file containing the silence; the
        caller is responsible for deleting it.
    """
    # Build the silent segment at 24 kHz, mono, 32-bit samples so it
    # matches the parameters used for the exported speech segments.
    silence = (
        AudioSegment.silent(duration=duration_ms, frame_rate=24000)
        .set_channels(1)
        .set_sample_width(4)
    )

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        out_path = tmp_file.name
        # Pin the encoder settings explicitly so every exported clip is
        # byte-compatible when clips are later concatenated.
        silence.export(
            out_path,
            format="mp3",
            bitrate="48k",
            parameters=[
                "-ac", "1",            # Mono
                "-ar", "24000",        # Sample rate
                "-sample_fmt", "s32",  # 32-bit samples
                "-codec:a", "libmp3lame",  # MP3 codec
            ],
        )
    return out_path

# Get all available voices
async def get_voices():
    """Fetch the edge-tts voice catalogue as ``{display label: short name}``."""
    catalogue = await edge_tts.list_voices()
    return {
        f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})": entry['ShortName']
        for entry in catalogue
    }

# Text-to-speech function for a single paragraph with SS handling
async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesize one paragraph of text, honouring inline voice/silence tags.

    Tag grammar inside *text*:
      * ``SS<seconds>`` — insert that much silence, e.g. ``SS1.5``.
      * ``1F 2F 3F 1M 2M 3M 1C 1O`` at the start of a part — switch to a
        preset voice for that part.
      * An integer directly after a voice tag overrides the pitch for that
        part, e.g. ``1F-20`` or ``1M24``.

    Args:
        text:  The paragraph, possibly containing the tags above.
        voice: Default voice label ("ShortName - Locale (Gender)"), or "".
        rate:  Speech-rate offset in percent (int).
        pitch: Pitch offset in Hz (int).

    Returns:
        Tuple ``(audio_segments, silence_durations)``.  ``audio_segments``
        is a list of temp-file paths (with ``None`` placeholders for empty
        parts); ``(None, [])`` is returned for blank input.
        ``silence_durations`` is never populated but is kept so the
        caller's unpacking interface stays unchanged.
    """
    # Preset voices reachable through the inline tags.
    voice3 = "en-US-BrianMultilingualNeural - en-US (Male)"  # good for reading
    voice1F = "en-GB-SoniaNeural - en-GB (Female)"
    voice2 = "en-GB-RyanNeural - en-GB (Male)"
    voice2F = "en-US-JennyNeural - en-US (Female)"
    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
    voice3F = "en-HK-YanNeural - en-HK (Female)"
    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  # child voice
    voice5 = "en-GB-RyanNeural - en-GB (Male)"      # old man

    # BUG FIX: the original fell back to a module-level `default_voice`
    # that is never defined at module scope (it only exists as a local in
    # create_demo), so the untagged/no-voice path raised NameError.
    fallback_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"

    # Tag -> (voice label, pitch override or None, rate override or None).
    presets = {
        "1F": (voice1F, 25, None),
        "2F": (voice2F, None, None),
        "3F": (voice3F, None, None),
        "1M": (voice1, None, None),
        "2M": (voice2, None, None),
        "3M": (voice3, None, None),
        "1C": (voice4, None, None),
        "1O": (voice5, -30, -15),
    }

    if not text.strip():
        return None, []  # nothing to say: no audio paths, no silences

    audio_segments = []
    silence_durations = []  # retained for interface compatibility
    parts = re.split(r'(SS\d+\.?\d*)', text)
    for part in parts:
        # Silence tags are handled FIRST.  (BUG FIX: the original stripped
        # the first integer out of every part before this check, mangling
        # "SS1.5" into "SS.5" so the tag was sent to TTS instead of
        # producing silence.)
        if re.match(r'SS\d+\.?\d*', part):
            silence_ms = float(part[2:]) * 1000  # seconds -> milliseconds
            audio_segments.append(get_silence(silence_ms))
            continue

        if not part.strip():
            audio_segments.append(None)  # keep a placeholder for empty parts
            continue

        current_voice = voice or fallback_voice
        current_rate = rate
        current_pitch = pitch
        processed_text = part

        # BUG FIX: a pitch override is only recognized directly AFTER a
        # voice tag ("1F-20", "1M24").  The original searched the whole
        # part for the first integer, which (a) consumed the "1" of the
        # tag itself, breaking every tagged part, (b) stole digits from
        # ordinary prose, and (c) kept the pitch as a str, which later
        # crashed the "+d" format spec.
        tag_match = re.match(r'(1F|2F|3F|1M|2M|3M|1C|1O)(-?\d+)?', part)
        if tag_match:
            preset_voice, preset_pitch, preset_rate = presets[tag_match.group(1)]
            current_voice = preset_voice
            if preset_pitch is not None:
                current_pitch = preset_pitch
            if preset_rate is not None:
                current_rate = preset_rate
            if tag_match.group(2) is not None:
                current_pitch = int(tag_match.group(2))  # explicit tag pitch wins
            processed_text = part[tag_match.end():]

        voice_short_name = current_voice.split(" - ")[0]
        rate_str = f"{current_rate:+d}%"
        pitch_str = f"{current_pitch:+d}Hz"
        communicate = edge_tts.Communicate(processed_text, voice_short_name, rate=rate_str, pitch=pitch_str)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            tmp_path = tmp_file.name
            await communicate.save(tmp_path)
        audio_segments.append(tmp_path)

    return audio_segments, silence_durations

# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    """Convert the full input text to a single MP3, paragraph by paragraph.

    Splits the input on straight/curly double quotes, synthesizes each
    piece via ``paragraph_to_speech``, then byte-concatenates the per-part
    MP3 files into one output file.

    Args:
        text:  Full input text.
        voice: Selected voice label, or "" for none.
        rate:  Speech-rate offset in percent (int).
        pitch: Pitch offset in Hz (int).

    Returns:
        ``(combined_audio_path, warning)`` — ``warning`` is a gr.Warning
        (or ``None``) describing why nothing was produced.
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # Split the text using straight quotes (") and curly quotes (“ and ”)
    paragraphs = [p.strip() for p in re.split(r'[“”"]', text) if p.strip()]
    final_audio_segments = []

    for paragraph in paragraphs:
        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            for i, path in enumerate(audio_paths):
                final_audio_segments.append(path)
                # silence_times is currently always empty (see
                # paragraph_to_speech); this interleave is kept for
                # forward compatibility.
                if i < len(silence_times):
                    final_audio_segments.append(silence_times[i])

    # BUG FIX: the original had a second guard (`all(not isinstance(...))`)
    # that is logically identical to this one (`not any(isinstance(...))`)
    # and therefore unreachable dead code; one check suffices.
    if not any(isinstance(item, str) for item in final_audio_segments):
        return None, None  # no actual audio was generated

    # BUG FIX: tempfile.mktemp() is deprecated and race-prone; mkstemp()
    # creates the file atomically and hands us the open descriptor.
    fd, combined_audio_path = tempfile.mkstemp(suffix=".mp3")
    # NOTE(review): segments are naively byte-concatenated; this assumes
    # every part was exported with identical MP3 parameters (see
    # get_silence's encoder settings) — confirm for new voice sources.
    with os.fdopen(fd, 'wb') as outfile:
        for segment in final_audio_segments:
            if isinstance(segment, str):
                try:
                    with open(segment, 'rb') as infile:
                        outfile.write(infile.read())
                    os.remove(segment)  # clean up the per-part temp file
                except FileNotFoundError:
                    print(f"Warning: Audio file not found: {segment}")
    return combined_audio_path, None

# Gradio interface function
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Synchronous Gradio callback bridging into the async TTS pipeline."""
    tts_coroutine = text_to_speech(text, voice, rate, pitch)
    audio_path, warning = asyncio.run(tts_coroutine)
    return audio_path, warning

# Create Gradio application
import gradio as gr

async def create_demo():
    """Build the Gradio Interface, populating the voice dropdown from edge-tts."""
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # 👈 Pick one of the available voices
    description = """
    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
    You can insert silence using the marker 'SS' (This will insert a Silence period from the Silence.mp3 file).
    Enter your text, select a voice, and adjust the speech rate and pitch. Can also set like 1F-20  or 1M24
    The application will process your text paragraph by paragraph (separated by two blank lines).
    """

    # Input widgets.
    text_input = gr.Textbox(
        label="Input Text",
        lines=5,
        placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence.",
    )
    voice_dropdown = gr.Dropdown(
        choices=[""] + list(voices.keys()),
        label="Select Voice",
        value=default_voice,
    )
    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
    pitch_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)

    # Output widgets.
    audio_output = gr.Audio(label="Generated Audio", type="filepath")
    warning_output = gr.Markdown(label="Warning", visible=False)

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
        outputs=[audio_output, warning_output],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging=False
    )
    return demo

# Run the application
if __name__ == "__main__":
    # Build the interface (the voice list is fetched asynchronously) and serve it.
    demo = asyncio.run(create_demo())
    demo.launch()