Update app.py
app.py CHANGED
@@ -202,7 +202,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     timed_audio_segments = []
     max_end_time_ms = 0
     previous_end_time_ms = 0
-    next_start_time_ms = None
+    next_start_time_ms = None  # Keep track of the start time of the *next* segment
+    previous_start_time_ms = 0
 
     for i, line in enumerate(lines):
         start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
@@ -220,56 +221,13 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
             current_audio_duration = len(combined_line_audio)
             intended_start_time = start_time
 
-
-            if i + 1 < len(lines):
-                next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
-                if next_line_match:
-                    next_h, next_m, next_s, next_ms = next_line_match.groups()
-                    next_start_time_ms = (
-                        int(next_h) * 3600000 +
-                        int(next_m) * 60000 +
-                        int(next_s) * 1000 +
-                        int(next_ms)
-                    )
-                else:
-                    next_start_time_ms = None
-            else:
-                next_start_time_ms = None
-
-            # Combine audio segments if current audio is longer than the time difference
-            while next_start_time_ms and current_audio_duration > (next_start_time_ms - start_time):
-                if i + 1 < len(lines):
-                    next_start_time, next_audio_paths = await process_transcript_line(lines[i + 1], voice, rate, pitch)
-                    if next_start_time is not None and next_audio_paths:
-                        for next_path in next_audio_paths:
-                            try:
-                                next_audio = AudioSegment.from_mp3(next_path)
-                                combined_line_audio += next_audio
-                                os.remove(next_path)
-                            except FileNotFoundError:
-                                print(f"Warning: Audio file not found: {next_path}")
-                        current_audio_duration = len(combined_line_audio)
-                        i += 1  # Move to the next line
-                        if i + 1 < len(lines):
-                            next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
-                            if next_line_match:
-                                next_h, next_m, next_s, next_ms = next_line_match.groups()
-                                next_start_time_ms = (
-                                    int(next_h) * 3600000 +
-                                    int(next_m) * 60000 +
-                                    int(next_s) * 1000 +
-                                    int(next_ms)
-                                )
-                            else:
-                                next_start_time_ms = None
-                        else:
-                            next_start_time_ms = None
-                    else:
-                        break  # Exit the loop if there are no more processable lines
-                else:
-                    break
+            if i > 0:
+                time_difference = start_time - previous_start_time_ms
+                if current_audio_duration > time_difference:
+                    intended_start_time = previous_end_time_ms
 
             timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
+            previous_start_time_ms = start_time
             previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
             max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
         elif audio_paths:
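The removed lines were a lookahead-and-merge loop; the added lines replace it with a simpler rule: if the current line's audio runs longer than the gap since the previous line started, the segment is pushed back to begin where the previous audio ended. A minimal standalone sketch of that rule (illustration only, not code from app.py; the segment names and durations below are made up):

```python
# Standalone sketch of the new placement rule (names and numbers are illustrative).
segments = [
    {"start_ms": 0,    "duration_ms": 4000},   # line 1
    {"start_ms": 3000, "duration_ms": 3500},   # line 2: longer than the 3000 ms gap
    {"start_ms": 9000, "duration_ms": 1000},   # line 3: fits without shifting
]

previous_start_ms = 0
previous_end_ms = 0
placed = []
for i, seg in enumerate(segments):
    intended_start = seg["start_ms"]
    # Same check as the added lines: compare the duration to the gap since the previous start.
    if i > 0 and seg["duration_ms"] > (seg["start_ms"] - previous_start_ms):
        intended_start = previous_end_ms  # start after the previous audio instead
    placed.append({"start": intended_start, "duration": seg["duration_ms"]})
    previous_start_ms = seg["start_ms"]
    previous_end_ms = max(previous_end_ms, intended_start + seg["duration_ms"])

print(placed)
# [{'start': 0, 'duration': 4000}, {'start': 4000, 'duration': 3500},
#  {'start': 9000, 'duration': 1000}]
```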
@@ -300,7 +258,7 @@ async def create_demo():
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
     Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
-    Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "
+    Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "1F Different Voice"
     Example:
     ```
     00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
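For reference, a line in the documented format can be pulled apart with the same HH:MM:SS,milliseconds pattern that appears in the diff; the quoted spans (optionally starting with a voice prefix such as 1F) are what switch voices. This is a quick standalone sketch, not the parsing actually done by process_transcript_line in app.py:

```python
import re

line = '00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.'

# HH:MM:SS,milliseconds prefix, same pattern used in the diff above.
match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
h, m, s, ms, text = match.groups()
start_ms = int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms)

# Quoted spans switch voices; anything outside quotes stays on the default voice.
quoted_segments = re.findall(r'"([^"]*)"', text)

print(start_ms)         # 0
print(quoted_segments)  # ['This is the default voice.', '1F Now a female voice.']
```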