cnph001 committed
Commit 451c102 · verified · 1 Parent(s): 049675e

Update app.py

Fix "no audio from edge TTS" error
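
The change swaps the single edge-tts call for a retry loop with a silent-clip fallback, which is what clears the "no audio" failure. A minimal sketch of that pattern, assuming edge-tts and pydub as used in the diff below (the helper name tts_with_fallback is illustrative and not part of the commit):

import asyncio
import tempfile

import edge_tts
from pydub import AudioSegment

async def tts_with_fallback(text: str, voice: str, retries: int = 3) -> str:
    """Try edge-tts up to `retries` times; on total failure return a short silent clip."""
    for attempt in range(retries):
        try:
            communicate = edge_tts.Communicate(text, voice)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
                audio_path = tmp.name
            await communicate.save(audio_path)
            return audio_path
        except Exception:
            if attempt == retries - 1:
                # Last attempt failed: emit 500 ms of silence so downstream
                # concatenation never sees a missing segment.
                fallback_path = tempfile.mktemp(suffix=".mp3")
                AudioSegment.silent(duration=500).export(fallback_path, format="mp3")
                return fallback_path
            await asyncio.sleep(0.5)  # brief pause before retrying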

Files changed (1):
  1. app.py +63 -120
app.py CHANGED
@@ -1,4 +1,5 @@
 ##fix overlap, remove silence, leave a tiny bit of silence
+## Simplified

 import spaces
 import gradio as gr
@@ -61,136 +62,78 @@ async def get_voices():
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
-    """Generates audio for a text segment, handling voice prefixes."""
+    """Generates audio for a text segment, handling voice prefixes, retries, and fallback."""
+
+    voice_map = {
+        "1F": ("en-GB-SoniaNeural", 25, 0),
+        "2F": ("en-US-JennyNeural", 0, 0),
+        "3F": ("en-HK-YanNeural", 0, 0),
+        "4F": ("en-US-EmmaNeural", 0, 0),
+        "1M": ("en-AU-WilliamNeural", 0, 0),
+        "2M": ("en-GB-RyanNeural", 0, 0),
+        "3M": ("en-US-BrianMultilingualNeural", 0, 0),
+        "4M": ("en-GB-ThomasNeural", 0, 0),
+        "1O": ("en-GB-RyanNeural", -20, -10),
+        "1C": ("en-GB-MaisieNeural", 0, 0),
+        "1V": ("vi-VN-HoaiMyNeural", 0, 0),
+        "2V": ("vi-VN-NamMinhNeural", 0, 0),
+        "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
+        "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
+    }
+
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
-    voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
-    voice1_short = voice1_full.split(" - ")[0]
-    voice1F_full ="en-GB-SoniaNeural - en-GB (Female)"
-    voice1F_short = voice1F_full.split(" - ")[0]
-    voice2_full = "en-GB-RyanNeural - en-GB (Male)"
-    voice2_short = voice2_full.split(" - ")[0]
-    voice2F_full = "en-US-JennyNeural - en-US (Female)"
-    voice2F_short = voice2F_full.split(" - ")[0]
-    voice3_full ="en-US-BrianMultilingualNeural - en-US (Male)" #good for reading
-    voice3_short = voice3_full.split(" - ")[0]
-    voice3F_full = "en-HK-YanNeural - en-HK (Female)"
-    voice3F_short = voice3F_full.split(" - ")[0]
-    voice4_full = "en-GB-ThomasNeural - en-GB (Male)"
-    voice4_short = voice4_full.split(" - ")[0]
-    voice4F_full ="en-US-EmmaNeural - en-US (Female)"
-    voice4F_short = voice4_full.split(" - ")[0]
-    voice5_full = "en-GB-RyanNeural - en-GB (Male)" #Old Man
-    voice5_short = voice5_full.split(" - ")[0]
-    voice6_full = "en-GB-MaisieNeural - en-GB (Female)" #Child
-    voice6_short = voice6_full.split(" - ")[0]
-    voice7_full = "vi-VN-HoaiMyNeural - vi-VN (Female)" #Vietnamese
-    voice7_short = voice7_full.split(" - ")[0]
-    voice8_full = "vi-VN-NamMinhNeural - vi-VN (Male)" #Vietnamese
-    voice8_short = voice8_full.split(" - ")[0]
-    voice9F_full = "de-DE-SeraphinaMultilingualNeural - de-DE (Female)" #Vietnamese
-    voice9F_short = voice7_full.split(" - ")[0]
-    voice9_full = "ko-KR-HyunsuMultilingualNeural - ko-KR (Male)" #Vietnamese
-    voice9_short = voice8_full.split(" - ")[0]
-    detect=0
-    if processed_text.startswith("1F"):
-        current_voice_short = voice1F_short
-        current_pitch = 25
-        detect=1
-        #processed_text = processed_text[2:].strip()
-    elif processed_text.startswith("2F"):
-        current_voice_short = voice2F_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("3F"):
-        current_voice_short = voice3F_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("4F"):
-        current_voice_short = voice4F_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("1M"):
-        current_voice_short = voice1_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("2M"):
-        current_voice_short = voice2_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("3M"):
-        current_voice_short = voice3_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("4M"):
-        current_voice_short = voice4_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("1O"): # Old man voice
-        current_voice_short = voice5_short
-        current_pitch = -20
-        current_rate = -10
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("1C"): #Child voice
-        current_voice_short = voice6_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("1V"): #Female VN
-        current_voice_short = voice7_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("2V"):
-        current_voice_short = voice8_short
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("3V"): #Female VN
-        current_voice_short = voice9F_short
-        current_pitch = 25
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    elif processed_text.startswith("4V"):
-        current_voice_short = voice9_short
-        current_pitch = -20
-        #processed_text = processed_text[2:].strip()
-        detect=1
-    #Looking for number following prefix, which are pitch values.
-    #match = re.search(r'[A-Za-z]\d+', part) # Look for a letter followed by one or more digits
-    match = re.search(r'[A-Za-z]+\-?\d+', processed_text) # Look for a letter(s) followed by an optional '-' and digits
+
+    detect = False
+
+    prefix = processed_text[:2]
+    if prefix in voice_map:
+        current_voice_short, pitch_adj, rate_adj = voice_map[prefix]
+        current_pitch += pitch_adj
+        current_rate += rate_adj
+        detect = True
+
+    match = re.search(r'[A-Za-z]+\-?\d+', processed_text)
     if match:
-        # Extract the prefix (e.g., '2F') and number (e.g., '-20')
-        prefix = ''.join([ch for ch in match.group() if ch.isalpha()]) # Extract letters (prefix)
-        number = int(''.join([ch for ch in match.group() if ch.isdigit() or ch == '-'])) # Extract digits (number)
+        group = match.group()
+        prefix_only = ''.join(filter(str.isalpha, group))
+        number = int(''.join(ch for ch in group if ch.isdigit() or ch == '-'))
        current_pitch += number
-        # Step 2: Remove the found number from the string
-        new_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip() # Remove prefix and number (e.g., '2F-20')
-        #processed_text = new_text[2:] #cut out the prefix like 1F, 3M etc
-        processed_text = new_text[len(prefix):] # Dynamically remove the prefix part
-    else:
-        if detect:
-            processed_text = processed_text[2:]
+        processed_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip()
+        processed_text = processed_text[len(prefix_only):].strip()
+    elif detect:
+        processed_text = processed_text[2:].strip()
+
     if processed_text:
         rate_str = f"{current_rate:+d}%"
        pitch_str = f"{current_pitch:+d}Hz"
-        communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-            audio_path = tmp_file.name
-        await communicate.save(audio_path)
-
-        # Load the audio from file
-        audio = AudioSegment.from_mp3(audio_path)
-
-        # Strip silence at start and end
-        audio = strip_silence(audio, silence_thresh=-40, min_silence_len=100)
-
-        # Save the stripped version back to file
-        stripped_path = tempfile.mktemp(suffix=".mp3")
-        audio.export(stripped_path, format="mp3")
-
-        return stripped_path
+
+        # Retry logic
+        for attempt in range(3):
+            try:
+                communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+                    audio_path = tmp_file.name
+                    await communicate.save(audio_path)
+
+                audio = AudioSegment.from_mp3(audio_path)
+                audio = strip_silence(audio, silence_thresh=-40, min_silence_len=100)
+
+                stripped_path = tempfile.mktemp(suffix=".mp3")
+                audio.export(stripped_path, format="mp3")
+                return stripped_path
+            except Exception as e:
+                if attempt == 2:
+                    # Final failure: return 500ms of silence
+                    silent_audio = AudioSegment.silent(duration=500)
+                    fallback_path = tempfile.mktemp(suffix=".mp3")
+                    silent_audio.export(fallback_path, format="mp3")
+                    return fallback_path
+                await asyncio.sleep(0.5) # brief wait before retry
+
     return None

 async def process_transcript_line(line, default_voice, rate, pitch):
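
For reference, the new speaker-prefix convention in a nutshell: a two-character tag at the start of a segment (1F, 2M, 1O, ...) picks a voice plus base pitch/rate adjustments from voice_map, and an optional signed number right after the tag (e.g. 1F-20) nudges the pitch further. Below is a simplified, self-contained sketch of that parsing; the helper resolve_prefix and the trimmed map are illustrative only, not the committed implementation.

import re

# Trimmed copy of the commit's voice_map: tag -> (voice, pitch adjustment, rate adjustment)
VOICE_MAP = {
    "1F": ("en-GB-SoniaNeural", 25, 0),
    "2M": ("en-GB-RyanNeural", 0, 0),
    "1O": ("en-GB-RyanNeural", -20, -10),
}

def resolve_prefix(segment, default_voice, rate=0, pitch=0):
    """Return (voice, rate, pitch, text) for one transcript segment."""
    text = segment.strip()
    voice = default_voice
    tag = text[:2]
    if tag in VOICE_MAP:
        voice, pitch_adj, rate_adj = VOICE_MAP[tag]
        pitch += pitch_adj
        rate += rate_adj
        text = text[2:]
        # An optional signed number right after the tag tweaks the pitch further.
        m = re.match(r'-?\d+', text)
        if m:
            pitch += int(m.group())
            text = text[m.end():]
        text = text.strip()
    return voice, rate, pitch, text

# "1F-20 Hello there" -> ('en-GB-SoniaNeural', 0, 5, 'Hello there'): +25 base, then -20
print(resolve_prefix("1F-20 Hello there", "en-US-JennyNeural"))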