cnph001 committed on
Commit
e4c6d2d
·
verified ·
1 Parent(s): c411b7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -33
app.py CHANGED
@@ -102,29 +102,20 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
102
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
103
  audio_path = tmp_file.name
104
  await communicate.save(audio_path)
105
- if target_duration_ms is not None and os.path.exists(audio_path):
106
  audio = AudioSegment.from_mp3(audio_path)
107
  audio_duration_ms = len(audio)
108
  #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
109
 
110
- if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
111
  speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
112
  #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
113
  if speed_factor > 0:
114
  if speed_factor < 1.0:
115
  speed_factor = 1.0
116
- #y, sr = librosa.load(audio_path, sr=None)
117
-
118
- # Load audio file
119
  audio = AudioSegment.from_file(audio_path)
120
- # Apply time-stretching
121
  audio_stretched = audio.speedup(playback_speed=speed_factor)
122
- # Save the stretched audio
123
  audio_stretched.export(audio_path, format="mp3")
124
-
125
- #y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
126
- #sf.write(audio_path, y_stretched, sr)
127
-
128
  else:
129
  print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
130
  return audio_path
@@ -133,24 +124,21 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
133
  return None
134
  return None
135
 
136
- async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
137
- """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
138
- match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
139
  if match:
140
- start_h, start_m, start_s, start_ms, end_h, end_m, end_s, end_ms, text_parts = match.groups()
141
  start_time_ms = (
142
  int(start_h) * 3600000 +
143
  int(start_m) * 60000 +
144
  int(start_s) * 1000 +
145
  int(start_ms)
146
  )
147
- end_time_ms = (
148
- int(end_h) * 3600000 +
149
- int(end_m) * 60000 +
150
- int(end_s) * 1000 +
151
- int(end_ms)
152
- )
153
- duration_ms = end_time_ms - start_time_ms
154
  audio_segments = []
155
  split_parts = re.split(r'[“”"]', text_parts)
156
  process_next = False
@@ -177,12 +165,22 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
177
  lines = transcript_text.strip().split('\n')
178
  timed_audio_segments = []
179
  max_end_time_ms = 0
180
- for line in lines:
181
- start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
 
 
 
 
 
 
 
 
 
 
 
 
182
  if start_time is not None and audio_paths:
183
  combined_line_audio = AudioSegment.empty()
184
- current_time_ms = start_time
185
- segment_duration = duration / len(audio_paths) if audio_paths else 0
186
  for path in audio_paths:
187
  if path: # Only process if audio_path is not None (meaning TTS was successful)
188
  try:
@@ -201,11 +199,14 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
201
  os.remove(path)
202
  except FileNotFoundError:
203
  pass # Clean up even if no timestamp
 
204
  if not timed_audio_segments:
205
  return None, "No processable audio segments found."
 
206
  final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
207
  for segment in timed_audio_segments:
208
  final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
 
209
  combined_audio_path = tempfile.mktemp(suffix=".mp3")
210
  final_audio.export(combined_audio_path, format="mp3")
211
  return combined_audio_path, None
@@ -219,14 +220,16 @@ async def create_demo():
219
  voices = await get_voices()
220
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
221
  description = """
222
- Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
223
- The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
 
 
224
  You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
225
- Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
226
  Example:
227
  ```
228
- 00:00:00,000 - 00:00:05,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
229
- 00:00:05,500 - 00:00:10,250 "1C Yes," said the child, "it is fun!"
230
  ```
231
  ***************************************************************************************************
232
  1M = en-AU-WilliamNeural - en-AU (Male)
@@ -248,7 +251,7 @@ async def create_demo():
248
  demo = gr.Interface(
249
  fn=tts_interface,
250
  inputs=[
251
- gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
252
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
253
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
254
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
@@ -258,7 +261,7 @@ async def create_demo():
258
  gr.Audio(label="Generated Audio", type="filepath"),
259
  gr.Markdown(label="Warning", visible=False)
260
  ],
261
- title="TTS with Duration-Aware Speed Adjustment and In-Quote Voice Switching",
262
  description=description,
263
  analytics_enabled=False,
264
  allow_flagging=False
 
102
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
103
  audio_path = tmp_file.name
104
  await communicate.save(audio_path)
105
+ if target_duration_ms is not None and os.path.exists(audio_path) and target_duration_ms > 0:
106
  audio = AudioSegment.from_mp3(audio_path)
107
  audio_duration_ms = len(audio)
108
  #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
109
 
110
+ if audio_duration_ms > target_duration_ms:
111
  speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
112
  #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
113
  if speed_factor > 0:
114
  if speed_factor < 1.0:
115
  speed_factor = 1.0
 
 
 
116
  audio = AudioSegment.from_file(audio_path)
 
117
  audio_stretched = audio.speedup(playback_speed=speed_factor)
 
118
  audio_stretched.export(audio_path, format="mp3")
 
 
 
 
119
  else:
120
  print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
121
  return audio_path
 
124
  return None
125
  return None
126
 
127
+ async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, speed_adjustment_factor):
128
+ """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
129
+ match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
130
  if match:
131
+ start_h, start_m, start_s, start_ms, text_parts = match.groups()
132
  start_time_ms = (
133
  int(start_h) * 3600000 +
134
  int(start_m) * 60000 +
135
  int(start_s) * 1000 +
136
  int(start_ms)
137
  )
138
+ duration_ms = None
139
+ if next_line_start_time is not None:
140
+ duration_ms = next_line_start_time - start_time_ms
141
+
 
 
 
142
  audio_segments = []
143
  split_parts = re.split(r'[“”"]', text_parts)
144
  process_next = False
 
165
  lines = transcript_text.strip().split('\n')
166
  timed_audio_segments = []
167
  max_end_time_ms = 0
168
+ for i, line in enumerate(lines):
169
+ next_line_start_time = None
170
+ if i < len(lines) - 1:
171
+ next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i+1])
172
+ if next_line_match:
173
+ nh, nm, ns, nms = next_line_match.groups()
174
+ next_line_start_time = (
175
+ int(nh) * 3600000 +
176
+ int(nm) * 60000 +
177
+ int(ns) * 1000 +
178
+ int(nms)
179
+ )
180
+
181
+ start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, speed_adjustment_factor)
182
  if start_time is not None and audio_paths:
183
  combined_line_audio = AudioSegment.empty()
 
 
184
  for path in audio_paths:
185
  if path: # Only process if audio_path is not None (meaning TTS was successful)
186
  try:
 
199
  os.remove(path)
200
  except FileNotFoundError:
201
  pass # Clean up even if no timestamp
202
+
203
  if not timed_audio_segments:
204
  return None, "No processable audio segments found."
205
+
206
  final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
207
  for segment in timed_audio_segments:
208
  final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
209
+
210
  combined_audio_path = tempfile.mktemp(suffix=".mp3")
211
  final_audio.export(combined_audio_path, format="mp3")
212
  return combined_audio_path, None
 
220
  voices = await get_voices()
221
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
222
  description = """
223
+ Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
224
+ The duration for each segment is determined by the timestamp of the following line.
225
+ The speed of the generated audio will be adjusted to fit within this duration.
226
+ If there is no subsequent timestamp, the speed adjustment will be skipped.
227
  You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
228
+ Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
229
  Example:
230
  ```
231
+ 00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
232
+ 00:00:05,500 "1C Yes," said the child, "it is fun!"
233
  ```
234
  ***************************************************************************************************
235
  1M = en-AU-WilliamNeural - en-AU (Male)
 
251
  demo = gr.Interface(
252
  fn=tts_interface,
253
  inputs=[
254
+ gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 "Text" more text "1F Different Voice"'),
255
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
256
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
257
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
 
261
  gr.Audio(label="Generated Audio", type="filepath"),
262
  gr.Markdown(label="Warning", visible=False)
263
  ],
264
+ title="TTS with Dynamic Duration and In-Quote Voice Switching",
265
  description=description,
266
  analytics_enabled=False,
267
  allow_flagging=False