File size: 8,845 Bytes
5021a0c
 
 
63f1d6d
4337b98
5021a0c
2928892
7042e46
2928892
 
 
 
 
 
 
 
 
7042e46
0596274
2928892
 
 
 
 
 
 
7042e46
2928892
5021a0c
2928892
 
 
 
 
 
 
 
 
 
2f93aef
63f1d6d
2f93aef
 
 
7042e46
2f93aef
 
 
 
 
 
2928892
 
2f93aef
 
 
 
 
 
 
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
ef4c8b8
2928892
 
 
2f93aef
 
 
2928892
 
 
2f93aef
 
 
 
2928892
2f93aef
2928892
2f93aef
 
d3fce98
 
 
 
 
 
2928892
2f93aef
4337b98
5021a0c
2928892
2f93aef
2928892
2f93aef
 
 
 
 
 
 
2928892
 
 
 
 
 
 
 
 
2f93aef
2928892
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f93aef
2928892
2f93aef
 
5021a0c
2928892
 
 
5021a0c
2928892
5021a0c
40b007d
2928892
5021a0c
92f530c
2f93aef
5021a0c
 
 
 
 
2928892
 
2f93aef
5021a0c
 
92f530c
5021a0c
2928892
5021a0c
 
 
2928892
5021a0c
2f93aef
5021a0c
 
 
 
 
 
 
 
2928892
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
import struct
import wave

# Helper: materialize a stretch of silence as a temporary WAV file
def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
    """Write a WAV file of pure silence into *temp_dir* and return its path.

    Args:
        duration: Length of the silence in seconds; must be positive.
        temp_dir: Directory in which the file is created.
        sample_rate: Frames per second of the output WAV.
        num_channels: Number of audio channels.
        sample_width: Bytes per sample.

    Returns:
        Path of the generated WAV file.

    Raises:
        ValueError: If duration is not positive.
    """
    if duration <= 0:
        raise ValueError("Duration must be positive.")

    frame_count = int(duration * sample_rate)
    out_path = os.path.join(temp_dir, f"silent_{duration}.wav")
    with wave.open(out_path, 'w') as handle:
        handle.setnchannels(num_channels)
        handle.setframerate(sample_rate)
        handle.setsampwidth(sample_width)
        # bytes(n) yields n zero bytes, i.e. digital silence for PCM audio
        handle.writeframes(bytes(frame_count * num_channels * sample_width))
    return out_path

# Function to process text and generate audio for a single paragraph
async def paragraph_to_speech(text, voice, rate, pitch):
    """Generate speech and silence segments for one paragraph of text.

    The paragraph may contain silence markers ('SS' followed by a duration in
    seconds, e.g. 'SS1.5') and two-character voice prefixes ('1F', '2M', '1O',
    ...) that switch the voice for the remainder of that part.

    Args:
        text: Paragraph text, possibly containing markers and prefixes.
        voice: Display name of the default voice ("ShortName - locale (Gender)").
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        Tuple (segments, []) where segments is a list of audio file paths
        (with None placeholders for empty parts), or (None, []) if the
        input is blank.
    """
    # Prefix -> (display name, rate override, pitch override); None keeps the
    # caller-supplied rate/pitch.
    prefix_voices = {
        "1F": ("en-US-EmmaNeural - en-US (Female)", None, None),
        "2F": ("en-US-JennyNeural - en-US (Female)", None, None),
        "3F": ("en-HK-YanNeural - en-HK (Female)", None, None),
        "1M": ("en-AU-WilliamNeural - en-AU (Male)", None, None),
        "2M": ("it-IT-GiuseppeMultilingualNeural - it-IT (Male)", None, None),
        "3M": ("en-US-BrianMultilingualNeural - en-US (Male)", None, None),
        "1C": ("en-GB-MaisieNeural - en-GB (Female)", None, None),  # Child
        "1O": ("en-GB-RyanNeural - en-GB (Male)", -20, -30),  # Old man: slower, lower
    }

    if not text.strip():
        return None, []  # Return None for audio path and empty list for silence

    audio_segments = []
    temp_dir = tempfile.gettempdir()
    # Split while keeping the silence markers as their own parts (capturing group).
    parts = re.split(r'(SS\d+\.?\d*)', text)

    for part in parts:
        if re.match(r'SS\d+\.?\d*', part):
            try:
                silence_duration = float(part[2:])
                # create_silent_wav raises ValueError for non-positive durations,
                # which is caught here alongside float() parse failures.
                audio_segments.append(create_silent_wav(silence_duration, temp_dir))
            except ValueError:
                print(f"Warning: Invalid silence duration format: {part}")
        elif part.strip():
            processed_text = part
            current_rate = rate
            current_pitch = pitch

            override = prefix_voices.get(part[:2])
            if override is not None:
                display_name, rate_override, pitch_override = override
                processed_text = part[2:]
                current_voice = display_name
                if rate_override is not None:
                    current_rate = rate_override
                if pitch_override is not None:
                    current_pitch = pitch_override
            else:
                current_voice = voice or prefix_voices["1M"][0]

            # BUG FIX: the original passed the full display string
            # ("Name - locale (Gender)") to edge-tts for prefixed parts; only
            # the default branch stripped it. edge-tts needs the short name.
            short_voice = current_voice.split(" - ")[0]
            # Coerce to int: Gradio sliders may deliver floats, and '%+d'
            # formatting raises on floats.
            rate_str = f"{int(current_rate):+d}%"
            pitch_str = f"{int(current_pitch):+d}Hz"
            communicate = edge_tts.Communicate(processed_text, short_voice, rate=rate_str, pitch=pitch_str)

            # NOTE(review): edge-tts emits MP3 data by default; the '.wav'
            # suffix here may not match the actual encoding — confirm before
            # concatenating these files with the wave module downstream.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            audio_segments.append(tmp_path)
        else:
            audio_segments.append(None)  # Placeholder for an empty part

    return audio_segments, []  # Silence is already materialized as WAV files

# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    """Convert multi-paragraph text into a single combined WAV file.

    Args:
        text: Full input text; paragraphs are separated by blank lines.
        voice: Selected display voice name.
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        Tuple (audio_path, warning): path to the combined WAV (or None when
        nothing was generated) and a Gradio warning / message (or None).
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
    final_audio_segments = []

    for paragraph in paragraphs:
        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            final_audio_segments.extend(audio_paths)

    # Segments are file paths (str) or None placeholders for empty parts.
    # BUG FIX: the original had a second check ("all not str") that was
    # logically identical to this one and therefore unreachable dead code.
    if not any(isinstance(item, str) for item in final_audio_segments):
        return None, None  # No actual audio generated

    # BUG FIX: tempfile.mktemp() is deprecated and race-prone; mkstemp()
    # creates the file atomically.
    fd, combined_audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    with wave.open(combined_audio_path, 'w') as outfile:
        first_audio = True
        sample_rate = None
        num_channels = None
        sample_width = None

        for segment_path in final_audio_segments:
            if not isinstance(segment_path, str):
                continue  # None placeholder for an empty text part
            try:
                with wave.open(segment_path, 'rb') as infile:
                    current_num_channels = infile.getnchannels()
                    current_sample_rate = infile.getframerate()
                    current_sample_width = infile.getsampwidth()
                    frames = infile.readframes(infile.getnframes())

                if first_audio:
                    # The first readable segment fixes the output format.
                    num_channels = current_num_channels
                    sample_rate = current_sample_rate
                    sample_width = current_sample_width
                    outfile.setnchannels(num_channels)
                    outfile.setframerate(sample_rate)
                    outfile.setsampwidth(sample_width)
                    first_audio = False
                elif (current_num_channels != num_channels or
                      current_sample_rate != sample_rate or
                      current_sample_width != sample_width):
                    print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
                    os.remove(segment_path)  # BUG FIX: original leaked skipped files
                    continue

                outfile.writeframes(frames)
                os.remove(segment_path)  # Clean up individual segment files
            except wave.Error as e:
                print(f"Warning: Error reading WAV file {segment_path}: {e}")
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {segment_path}")

    return combined_audio_path, None

# Gradio interface function (wrapper to run async code)
def tts_interface_sync(text, voice, rate, pitch):
    """Synchronous Gradio entry point that drives the async TTS pipeline.

    BUG FIX: the original called `tts_interface`, a name defined nowhere in
    this file, which raised NameError on every invocation. The async pipeline
    is `text_to_speech`.
    """
    return asyncio.run(text_to_speech(text, voice, rate, pitch))

# Gradio interface
async def create_demo():
    """Assemble and return the Gradio Interface for the TTS application.

    NOTE(review): relies on a module-level coroutine `get_voices()` that is
    not defined in this file — confirm it is provided elsewhere.
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """
    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
    You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application will process your text paragraph by paragraph (separated by two blank lines).
    """

    # Build each component up front so the Interface call stays readable.
    text_box = gr.Textbox(
        label="Input Text",
        lines=5,
        placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence.",
    )
    voice_dropdown = gr.Dropdown(
        choices=[""] + list(voices.keys()),
        label="Select Voice",
        value=default_voice,
    )
    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
    pitch_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
    audio_out = gr.Audio(label="Generated Audio", type="filepath")
    warning_out = gr.Markdown(label="Warning", visible=False)

    return gr.Interface(
        fn=tts_interface_sync,
        inputs=[text_box, voice_dropdown, rate_slider, pitch_slider],
        outputs=[audio_out, warning_out],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging=False,
    )

# Run the application
if __name__ == "__main__":
    # create_demo is a coroutine, so the interface is built via asyncio.run
    demo = asyncio.run(create_demo())
    demo.launch()