import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re  # regular expressions for the SS silence markers and speaker prefixes
import wave
from pydub import AudioSegment  # assumed available (with ffmpeg) to decode edge-tts MP3 output to WAV

# Function to create a temporary silent WAV file
def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
    """Creates a temporary WAV file containing silence.

    Args:
        duration (float): Duration of silence in seconds.
        temp_dir (str): Directory to save the temporary file.
        sample_rate (int): Sample rate of the audio (samples per second).
        num_channels (int): Number of audio channels (1 for mono, 2 for stereo).
        sample_width (int): Sample width in bytes (e.g., 2 for 16-bit).

    Returns:
        str: Path to the temporary silent WAV file.
    """
    num_frames = int(duration * sample_rate)
    silent_data = b'\x00' * (num_frames * num_channels * sample_width)

    temp_wav_path = os.path.join(temp_dir, f"silent_{duration}.wav")
    with wave.open(temp_wav_path, 'w') as wf:
        wf.setnchannels(num_channels)
        wf.setframerate(sample_rate)
        wf.setsampwidth(sample_width)
        wf.writeframes(silent_data)
    return temp_wav_path
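
# Example (illustrative): create_silent_wav(1.5, tempfile.gettempdir()) writes a
# 1.5-second mono 16-bit 44.1 kHz silent WAV and returns its path.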

# Text-to-speech for a single paragraph, handling SS silence markers and speaker prefixes
async def paragraph_to_speech(text, voice, rate, pitch):
    voice3 = "en-US-BrianMultilingualNeural - en-US (Male)"  # good for reading
    voice1F = "en-US-EmmaNeural - en-US (Female)"
    voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
    voice2F = "en-US-JennyNeural - en-US (Female)"
    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
    voice3F = "en-HK-YanNeural - en-HK (Female)"
    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  # child voice
    voice5 = "en-GB-RyanNeural - en-GB (Male)"  # old man

    if not text.strip():
        return None, []  # Return None for audio path and empty list for silence

    audio_segments = []
    temp_dir = tempfile.gettempdir()
    parts = re.split(r'(SS\d+\.?\d*)', text)
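    # The capturing group keeps the markers in the result, e.g.
    # "Hello SS1.5 world" -> ["Hello ", "SS1.5", " world"].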

    for part in parts:
        if re.match(r'SS\d+\.?\d*', part):
            try:
                silence_duration = float(part[2:])
                # 24 kHz mono 16-bit, matching the decoded edge-tts speech segments below
                silent_wav_path = create_silent_wav(silence_duration, temp_dir, sample_rate=24000)
                audio_segments.append(silent_wav_path)
            except ValueError:
                print(f"Warning: Invalid silence duration format: {part}")
        elif part.strip():
            processed_text = part
            current_voice = voice
            current_rate = rate
            current_pitch = pitch

            if part.startswith("1F"):
                processed_text = part[2:]
                current_voice = voice1F.split(" - ")[0]
            elif part.startswith("2F"):
                processed_text = part[2:]
                current_voice = voice2F.split(" - ")[0]
            elif part.startswith("3F"):
                processed_text = part[2:]
                current_voice = voice3F.split(" - ")[0]
            elif part.startswith("1M"):
                processed_text = part[2:]
                current_voice = voice1.split(" - ")[0]
            elif part.startswith("2M"):
                processed_text = part[2:]
                current_voice = voice2.split(" - ")[0]
            elif part.startswith("3M"):
                processed_text = part[2:]
                current_voice = voice3.split(" - ")[0]
            elif part.startswith("1C"):
                processed_text = part[2:]
                current_voice = voice4.split(" - ")[0]
            elif part.startswith("1O"):
                processed_text = part[2:]
                current_voice = voice5.split(" - ")[0]
                current_pitch = -30
                current_rate = -20
            else:
                # No speaker prefix: use the voice selected in the UI
                # (text_to_speech guarantees a non-empty selection).
                current_voice = voice.split(" - ")[0]
            rate_str = f"{int(current_rate):+d}%"
            pitch_str = f"{int(current_pitch):+d}Hz"
            communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
            # edge-tts emits MP3 by default, so save as .mp3 and decode it to a
            # 24 kHz mono 16-bit WAV (matching the silence segments) so the
            # wave-based concatenation in text_to_speech can read it.
            # Assumes pydub and ffmpeg are installed.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_mp3_path = tmp_file.name
            await communicate.save(tmp_mp3_path)
            tmp_wav_path = tmp_mp3_path[:-4] + ".wav"
            AudioSegment.from_file(tmp_mp3_path).set_frame_rate(24000).set_channels(1).set_sample_width(2).export(tmp_wav_path, format="wav")
            os.remove(tmp_mp3_path)
            audio_segments.append(tmp_wav_path)
        else:
            audio_segments.append(None)  # whitespace-only fragment; filtered out later

    # Silence is already rendered directly as WAV segments above, so no separate
    # list of silence times is needed; the empty list keeps the return shape stable.
    return audio_segments, []
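
# Example of paragraph_to_speech (hypothetical temp-file paths): the input
# "1FHi SS0.5 there" yields three WAV paths -- "Hi " spoken by the 1F voice,
# 0.5 s of silence, then " there" spoken by the user-selected voice.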

# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # Split into paragraphs on blank lines, as described in the UI text.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
    final_audio_segments = []

    for paragraph in paragraphs:
        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            final_audio_segments.extend(audio_paths)

    # Every usable segment (speech or silence) is a file path; if none were
    # produced, report it instead of writing an empty WAV.
    if not any(isinstance(item, str) for item in final_audio_segments):
        return None, gr.Warning("No audio could be generated from the input text.")

    # tempfile.mktemp is deprecated; create the output file safely instead.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as combined_file:
        combined_audio_path = combined_file.name
    with wave.open(combined_audio_path, 'w') as outfile:
        first_audio = True
        sample_rate = None
        num_channels = None
        sample_width = None

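        # The first readable segment fixes the output WAV header; later segments
        # whose channel count, sample rate, or sample width differ are skipped
        # with a warning rather than corrupting the combined file.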
        for segment_path in final_audio_segments:
            if isinstance(segment_path, str):
                try:
                    with wave.open(segment_path, 'rb') as infile:
                        current_num_channels = infile.getnchannels()
                        current_sample_rate = infile.getframerate()
                        current_sample_width = infile.getsampwidth()
                        frames = infile.readframes(infile.getnframes())

                        if first_audio:
                            num_channels = current_num_channels
                            sample_rate = current_sample_rate
                            sample_width = current_sample_width
                            outfile.setnchannels(num_channels)
                            outfile.setframerate(sample_rate)
                            outfile.setsampwidth(sample_width)
                            first_audio = False
                        elif (current_num_channels != num_channels or
                              current_sample_rate != sample_rate or
                              current_sample_width != sample_width):
                            print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
                            continue

                        outfile.writeframes(frames)
                    os.remove(segment_path)  # Clean up individual files
                except wave.Error as e:
                    print(f"Warning: Error reading WAV file {segment_path}: {e}")
                except FileNotFoundError:
                    print(f"Warning: Audio file not found: {segment_path}")

    return combined_audio_path, None

# Gradio interface function
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    return audio, warning

async def get_voices():
    voices_list = await edge_tts.list_voices()
    voices_dict = {v["ShortName"]: f"{v['Name']} - {v['LocaleName']} ({v['Gender']})" for v in voices_list}
    return voices_dict

# Create Gradio application
async def create_demo():
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural"  # 👈 must be one of the dropdown choices (voice ShortNames)
    description = """
    The selected voice is used for unprefixed text. Speaker prefixes: 1F: US Emma, 2F: US Jenny, 3F: HK Yan, 1M: AU William, 2M: IT Giuseppe, 3M: US Brian, 1C: child voice (GB Maisie), 1O: old man (GB Ryan).
    You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application will process your text paragraph by paragraph (paragraphs are separated by blank lines).
    """

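    # Example input (illustrative), two paragraphs separated by a blank line:
    #   1FGood morning! SS0.8 ...the selected voice continues after a 0.8 s pause.
    #
    #   2MBuongiorno! This paragraph is read by the Giuseppe voice.
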
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with blank lines. Use 'SS[duration]' for silence, e.g. SS1.2."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging="never"
    )
    return demo

# Run the application
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()