cnph001 committed on
Commit 2928892 · verified · 1 Parent(s): 7042e46

Change to output WAV

Files changed (1)
  1. app.py +94 -78
app.py CHANGED
@@ -4,41 +4,39 @@ import edge_tts
  import asyncio
  import tempfile
  import os
- import re  # Import the regular expression module
  import struct
-
- # Function to create a temporary silent MP3 file (basic approximation)
- def create_silent_mp3(duration, temp_dir):
-     frame_rate = 44100
-     num_channels = 1
-     sample_width = 2  # bytes (16-bit)
-     num_frames = int(duration * frame_rate)
      silent_data = b'\x00' * (num_frames * num_channels * sample_width)
-
-     temp_silent_file = os.path.join(temp_dir, f"silent_{duration}.raw")
-     with open(temp_silent_file, 'wb') as f:
-         f.write(silent_data)
-
-     # This is a very basic way to make it look like an MP3 - it won't be a valid MP3.
-     # For a proper MP3, you'd need an MP3 encoding library or ffmpeg.
-     temp_mp3_path = os.path.join(temp_dir, f"silent_{duration}.mp3")
-     with open(temp_mp3_path, 'wb') as f:
-         f.write(b'\xff\xfb\x90\x00\x00\x00\x00')  # Minimal MP3 header (very simplified)
-         f.write(silent_data)  # Append raw silence
-
-     os.remove(temp_silent_file)  # Clean up the raw file
-     return temp_mp3_path
-
- # Text-to-speech function for a single paragraph with SS handling
  async def paragraph_to_speech(text, voice, rate, pitch):
-     voice3 = "en-US-BrianMultilingualNeural - en-US (Male)"  # good for reading
-     voice1F = "en-US-EmmaNeural - en-US (Female)"
-     voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
-     voice2F = "en-US-JennyNeural - en-US (Female)"
-     voice1 = "en-AU-WilliamNeural - en-AU (Male)"
-     voice3F = "en-HK-YanNeural - en-HK (Female)"
-     voice4 = "en-GB-MaisieNeural - en-GB (Female)"  # Child
-     voice5 = "en-GB-RyanNeural - en-GB (Male)"  # Old Man

      if not text.strip():
          return None, []  # Return None for audio path and empty list for silence
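The removed comment above is right that the hand-rolled header never yields a valid MP3: MP3 has no notion of raw zero samples, so decoders misread the file. A minimal sketch of the ffmpeg route that comment mentions (a hypothetical helper, `create_silent_mp3_ffmpeg`; assumes an ffmpeg binary on PATH; `anullsrc` is ffmpeg's built-in silence source):

    import subprocess

    def create_silent_mp3_ffmpeg(duration, out_path):
        # Generate genuine MP3 silence: mono, 44.1 kHz, for `duration` seconds
        subprocess.run([
            "ffmpeg", "-y",
            "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
            "-t", str(duration),
            out_path,
        ], check=True)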
@@ -51,8 +49,8 @@ async def paragraph_to_speech(text, voice, rate, pitch):
          if re.match(r'SS\d+\.?\d*', part):
              try:
                  silence_duration = float(part[2:])
-                 silent_mp3_path = create_silent_mp3(silence_duration, temp_dir)
-                 audio_segments.append(silent_mp3_path)
              except ValueError:
                  print(f"Warning: Invalid silence duration format: {part}")
          elif part.strip():
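A note on the marker syntax in this hunk: the regex requires at least one digit before the optional decimal point, so 'SS.5' is treated as spoken text, not silence. A small standalone demo:

    import re

    for part in ["SS1.2", "SS3", "SS.5", "hello"]:
        if re.match(r'SS\d+\.?\d*', part):
            print(part, "->", float(part[2:]), "seconds of silence")
        else:
            print(part, "-> spoken text")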
@@ -61,47 +59,50 @@ async def paragraph_to_speech(text, voice, rate, pitch):
              current_rate = rate
              current_pitch = pitch

              if part.startswith("1F"):
                  processed_text = part[2:]
-                 current_voice = voice1F.split(" - ")[0]
              elif part.startswith("2F"):
                  processed_text = part[2:]
-                 current_voice = voice2F.split(" - ")[0]
              elif part.startswith("3F"):
                  processed_text = part[2:]
-                 current_voice = voice3F.split(" - ")[0]
              elif part.startswith("1M"):
                  processed_text = part[2:]
-                 current_voice = voice1.split(" - ")[0]
              elif part.startswith("2M"):
                  processed_text = part[2:]
-                 current_voice = voice2.split(" - ")[0]
              elif part.startswith("3M"):
                  processed_text = part[2:]
-                 current_voice = voice3.split(" - ")[0]
              elif part.startswith("1C"):
                  processed_text = part[2:]
-                 current_voice = voice4.split(" - ")[0]
              elif part.startswith("1O"):
                  processed_text = part[2:]
-                 current_voice = voice5.split(" - ")[0]
                  current_pitch = -30
                  current_rate = -20
              else:
-                 current_voice = (voice or default_voice).split(" - ")[0]
-                 processed_text = part[:]
              rate_str = f"{current_rate:+d}%"
              pitch_str = f"{current_pitch:+d}Hz"
              communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
-             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                  tmp_path = tmp_file.name
                  await communicate.save(tmp_path)
                  audio_segments.append(tmp_path)
          else:
-             # pass  # Ignore empty parts
-             audio_segments.append(None)  # Empty string

-     return audio_segments, silence_durations

  # Main text-to-speech function that processes paragraphs and silence
  async def text_to_speech(text, voice, rate, pitch):
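The `:+d` format specs produce the signed strings edge-tts expects, e.g. 0 becomes "+0%" and -20 becomes "-20%". A minimal standalone call (a sketch; assumes an edge-tts version that accepts the rate and pitch keywords, as the code above does):

    import asyncio
    import edge_tts

    rate_str = f"{-20:+d}%"    # "-20%"
    pitch_str = f"{-30:+d}Hz"  # "-30Hz"

    async def say():
        # Voice is the short name, without the " - en-GB (Male)" display suffix
        communicate = edge_tts.Communicate("Back in my day...", "en-GB-RyanNeural",
                                           rate=rate_str, pitch=pitch_str)
        await communicate.save("old_man.mp3")

    asyncio.run(say())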
@@ -110,16 +111,13 @@ async def text_to_speech(text, voice, rate, pitch):
      if not voice:
          return None, gr.Warning("Please select a voice.")

-     paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
      final_audio_segments = []

      for paragraph in paragraphs:
-         audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
          if audio_paths:
-             for i, path in enumerate(audio_paths):
-                 final_audio_segments.append(path)
-                 if i < len(silence_times):
-                     final_audio_segments.append(silence_times[i])

      if not any(isinstance(item, str) for item in final_audio_segments):
          return None, None  # No actual audio generated
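This hunk is the behavioral core of the paragraph-handling change: the old code split the input on double quotes, while the new version (later in this diff) splits on blank lines, matching the textbox placeholder. A quick comparison sketch:

    import re

    text = 'First paragraph.\n\n"Quoted aside."\n\nSecond paragraph.'

    old_paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
    new_paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]

    print(old_paragraphs)  # splits at every quote character, losing the quotes
    print(new_paragraphs)  # ['First paragraph.', '"Quoted aside."', 'Second paragraph.']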
@@ -127,35 +125,53 @@ async def text_to_speech(text, voice, rate, pitch):
      if all(not isinstance(item, str) for item in final_audio_segments):
          return None, "Only silence markers found."

-     combined_audio_path = tempfile.mktemp(suffix=".mp3")
-     with open(combined_audio_path, 'wb') as outfile:
-         for segment in final_audio_segments:
-             if isinstance(segment, str):
                  try:
-                     with open(segment, 'rb') as infile:
-                         outfile.write(infile.read())
-                     os.remove(segment)  # Clean up individual files
                  except FileNotFoundError:
-                     print(f"Warning: Audio file not found: {segment}")
-             elif isinstance(segment, (int, float)):
-                 # Basic silence insertion (approximate)
-                 silence = b'\x00' * int(segment * 44100 * 2)  # Assuming 16-bit mono at 44.1kHz
-                 outfile.write(silence)

      return combined_audio_path, None
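Worth noting why this block was removed: byte-for-byte MP3 concatenation only half works because MPEG decoders resynchronize on frame headers, and the injected runs of zero bytes are not valid MPEG frames at all, so duration and seeking in the result were unreliable. A sketch of a robust alternative with pydub (a hypothetical `combine` helper; assumes pydub and ffmpeg are installed):

    from pydub import AudioSegment

    def combine(segments):
        # segments: file paths (str) interleaved with silence durations (float seconds)
        combined = AudioSegment.empty()
        for seg in segments:
            if isinstance(seg, str):
                combined += AudioSegment.from_file(seg)
            else:
                combined += AudioSegment.silent(duration=int(seg * 1000))  # milliseconds
        return combined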

- # Gradio interface function
- @spaces.GPU
- def tts_interface(text, voice, rate, pitch):
-     audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
-     return audio, warning
-
- # Create Gradio application
- import gradio as gr

  async def create_demo():
      voices = await get_voices()
-     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # 👈 Pick one of the available voices
      description = """
      Default = male; other voices: 1F: US_Emma, 2F: US_Jenny, 3F: HK_Yan, 1M: AU_Will, 2M: IT_Giuseppe, 3M: US_Brian, 1C: child voice, 1O: old man
      You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
@@ -164,18 +180,18 @@ async def create_demo():
      """

      demo = gr.Interface(
-         fn=tts_interface,
-         inputs=[
              gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
              gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
              gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
              gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
          ],
-         outputs=[
              gr.Audio(label="Generated Audio", type="filepath"),
              gr.Markdown(label="Warning", visible=False)
          ],
-         title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
          description=description,
          article="Process text paragraph by paragraph for smoother output and insert silence markers.",
          analytics_enabled=False,
  analytics_enabled=False,
@@ -186,4 +202,4 @@ async def create_demo():
  # Run the application
  if __name__ == "__main__":
      demo = asyncio.run(create_demo())
- demo.launch()
 
app.py (new version):

  import asyncio
  import tempfile
  import os
+ import re
  import struct
+ import wave
+
+ # Function to create a temporary silent WAV file
+ def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
+     """Creates a temporary WAV file containing silence."""
+     if duration <= 0:
+         raise ValueError("Duration must be positive.")
+
+     num_frames = int(duration * sample_rate)
      silent_data = b'\x00' * (num_frames * num_channels * sample_width)

+     temp_wav_path = os.path.join(temp_dir, f"silent_{duration}.wav")
+     with wave.open(temp_wav_path, 'w') as wf:
+         wf.setnchannels(num_channels)
+         wf.setframerate(sample_rate)
+         wf.setsampwidth(sample_width)
+         wf.writeframes(silent_data)
+     return temp_wav_path
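A quick sanity check for the new helper (a standalone sketch using only the stdlib and `create_silent_wav` as defined above):

    import tempfile
    import wave

    temp_dir = tempfile.mkdtemp()
    path = create_silent_wav(1.2, temp_dir)
    with wave.open(path, 'rb') as wf:
        # Expect 52920 frames: 1.2 s * 44100 Hz, mono, 16-bit
        print(wf.getnframes(), wf.getframerate(), wf.getnchannels(), wf.getsampwidth())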
 
 
 
+ # Function to process text and generate audio for a single paragraph
  async def paragraph_to_speech(text, voice, rate, pitch):
+     voices = {
+         "voice1F": "en-US-EmmaNeural - en-US (Female)",
+         "voice2F": "en-US-JennyNeural - en-US (Female)",
+         "voice3F": "en-HK-YanNeural - en-HK (Female)",
+         "voice1": "en-AU-WilliamNeural - en-AU (Male)",
+         "voice2": "it-IT-GiuseppeMultilingualNeural - it-IT (Male)",
+         "voice3": "en-US-BrianMultilingualNeural - en-US (Male)",
+         "voice4": "en-GB-MaisieNeural - en-GB (Female)",  # Child
+         "voice5": "en-GB-RyanNeural - en-GB (Male)"  # Old Man
+     }

      if not text.strip():
          return None, []  # Return None for audio path and empty list for silence
 
          if re.match(r'SS\d+\.?\d*', part):
              try:
                  silence_duration = float(part[2:])
+                 silent_wav_path = create_silent_wav(silence_duration, temp_dir)
+                 audio_segments.append(silent_wav_path)
              except ValueError:
                  print(f"Warning: Invalid silence duration format: {part}")
          elif part.strip():
 
              current_rate = rate
              current_pitch = pitch

+             # Select voice based on part prefix; strip the " - ..." display
+             # suffix so edge-tts receives the short voice name it expects
              if part.startswith("1F"):
                  processed_text = part[2:]
+                 current_voice = voices["voice1F"].split(" - ")[0]
              elif part.startswith("2F"):
                  processed_text = part[2:]
+                 current_voice = voices["voice2F"].split(" - ")[0]
              elif part.startswith("3F"):
                  processed_text = part[2:]
+                 current_voice = voices["voice3F"].split(" - ")[0]
              elif part.startswith("1M"):
                  processed_text = part[2:]
+                 current_voice = voices["voice1"].split(" - ")[0]
              elif part.startswith("2M"):
                  processed_text = part[2:]
+                 current_voice = voices["voice2"].split(" - ")[0]
              elif part.startswith("3M"):
                  processed_text = part[2:]
+                 current_voice = voices["voice3"].split(" - ")[0]
              elif part.startswith("1C"):
                  processed_text = part[2:]
+                 current_voice = voices["voice4"].split(" - ")[0]
              elif part.startswith("1O"):
                  processed_text = part[2:]
+                 current_voice = voices["voice5"].split(" - ")[0]
                  current_pitch = -30
                  current_rate = -20
              else:
+                 current_voice = (voice or voices["voice1"]).split(" - ")[0]
+                 processed_text = part[:]
+
              rate_str = f"{current_rate:+d}%"
              pitch_str = f"{current_pitch:+d}Hz"
              communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
+
+             # Save speech output to temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                  tmp_path = tmp_file.name
                  await communicate.save(tmp_path)
                  audio_segments.append(tmp_path)
          else:
+             audio_segments.append(None)  # Placeholder for an empty part

+     return audio_segments, []  # Silence times list is empty: silence is now written directly as WAV segments
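One caveat with this hunk: as of the edge-tts versions I'm aware of, `communicate.save` writes MP3-encoded audio by default, so giving the temporary file a .wav suffix does not make it a RIFF/WAV file, and the wave-based combining step below may reject these segments with wave.Error. A re-encoding sketch (a hypothetical `to_pcm_wav` helper; assumes pydub with ffmpeg on PATH is acceptable as a dependency):

    from pydub import AudioSegment

    def to_pcm_wav(src_path, dst_path, sample_rate=44100, channels=1):
        # Decode the MP3 data edge-tts saved and re-export as PCM WAV,
        # matching the format create_silent_wav produces
        audio = AudioSegment.from_file(src_path)
        audio = audio.set_frame_rate(sample_rate).set_channels(channels)
        audio.export(dst_path, format="wav")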

  # Main text-to-speech function that processes paragraphs and silence
  async def text_to_speech(text, voice, rate, pitch):

      if not voice:
          return None, gr.Warning("Please select a voice.")

+     paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
      final_audio_segments = []

      for paragraph in paragraphs:
+         audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
          if audio_paths:
+             final_audio_segments.extend(audio_paths)

      if not any(isinstance(item, str) for item in final_audio_segments):
          return None, None  # No actual audio generated
 
      if all(not isinstance(item, str) for item in final_audio_segments):
          return None, "Only silence markers found."

+     combined_audio_path = tempfile.mktemp(suffix=".wav")
+     with wave.open(combined_audio_path, 'w') as outfile:
+         first_audio = True
+         sample_rate = None
+         num_channels = None
+         sample_width = None
+
+         for segment_path in final_audio_segments:
+             if isinstance(segment_path, str):
                  try:
+                     with wave.open(segment_path, 'rb') as infile:
+                         current_num_channels = infile.getnchannels()
+                         current_sample_rate = infile.getframerate()
+                         current_sample_width = infile.getsampwidth()
+                         frames = infile.readframes(infile.getnframes())
+
+                     if first_audio:
+                         num_channels = current_num_channels
+                         sample_rate = current_sample_rate
+                         sample_width = current_sample_width
+                         outfile.setnchannels(num_channels)
+                         outfile.setframerate(sample_rate)
+                         outfile.setsampwidth(sample_width)
+                         first_audio = False
+                     elif (current_num_channels != num_channels or
+                           current_sample_rate != sample_rate or
+                           current_sample_width != sample_width):
+                         print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
+                         continue
+
+                     outfile.writeframes(frames)
+                     os.remove(segment_path)  # Clean up individual files
+                 except wave.Error as e:
+                     print(f"Warning: Error reading WAV file {segment_path}: {e}")
                  except FileNotFoundError:
+                     print(f"Warning: Audio file not found: {segment_path}")

      return combined_audio_path, None
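Side note: `tempfile.mktemp` has been deprecated since Python 2.3 because another process can claim the name between creation and use. A sketch of the safer pattern already used elsewhere in this file:

    import tempfile

    # Reserve the output path atomically instead of tempfile.mktemp(suffix=".wav")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        combined_audio_path = f.name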

+ # Gradio interface function (wrapper to run async code)
+ def tts_interface_sync(text, voice, rate, pitch):
+     # Call the async pipeline directly; the old tts_interface wrapper was removed above
+     return asyncio.run(text_to_speech(text, voice, rate, pitch))
 
+ # Gradio interface
  async def create_demo():
      voices = await get_voices()
+     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
      description = """
      Default = male; other voices: 1F: US_Emma, 2F: US_Jenny, 3F: HK_Yan, 1M: AU_Will, 2M: IT_Giuseppe, 3M: US_Brian, 1C: child voice, 1O: old man
      You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
      """

      demo = gr.Interface(
+         fn=tts_interface_sync,
+         inputs=[
              gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
              gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
              gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
              gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
          ],
+         outputs=[
              gr.Audio(label="Generated Audio", type="filepath"),
              gr.Markdown(label="Warning", visible=False)
          ],
+         title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
          description=description,
          article="Process text paragraph by paragraph for smoother output and insert silence markers.",
          analytics_enabled=False,

  # Run the application
  if __name__ == "__main__":
      demo = asyncio.run(create_demo())
+     demo.launch()
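For a quick check outside the Gradio UI, a minimal smoke test (a sketch; assumes this file is saved as app.py, is importable, and that the voice string exists in get_voices()):

    import asyncio
    import app

    path, warning = asyncio.run(app.text_to_speech(
        "1FHello from Emma.\n\nSS1.0\n\nAnd back to the default voice.",
        "en-US-AndrewMultilingualNeural - en-US (Male)", 0, 0))
    print("combined audio at:", path, "warning:", warning)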