cnph001 committed on
Commit
3330c34
·
verified ·
1 Parent(s): 218e261

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -7
app.py CHANGED
@@ -12,6 +12,27 @@ import soundfile as sf
12
  import numpy as np
13
  from pydub import AudioSegment
14
  from pydub.playback import play
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  def get_silence(duration_ms=1000):
@@ -54,7 +75,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
54
  current_rate = rate
55
  current_pitch = pitch
56
  processed_text = text_segment.strip()
57
- print(f"Processing this text segment: '{processed_text}'") # Debug
58
  voice_map = {
59
  "1F": "en-GB-SoniaNeural",
60
  "2M": "en-GB-RyanNeural",
@@ -72,6 +93,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
72
  "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
73
  }
74
  detect = 0
 
75
  for prefix, voice_short in voice_map.items():
76
  if processed_text.startswith(prefix):
77
  current_voice_short = voice_short
@@ -83,20 +105,23 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
83
  detect = 1
84
  processed_text = processed_text[len(prefix):].strip()
85
  break
86
- match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
 
 
87
  if match:
88
  prefix_pitch = match.group(1)
89
  number = int(match.group(2))
90
  if prefix_pitch in voice_map:
91
  current_pitch += number
92
- processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
 
93
  elif detect:
94
  processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
95
- elif detect:
96
- processed_text = processed_text[2:].strip()
97
  if processed_text:
98
  rate_str = f"{current_rate:+d}%"
99
  pitch_str = f"{current_pitch:+d}Hz"
 
100
  try:
101
  communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
102
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
@@ -230,11 +255,25 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
230
  if speed_factor < 1.0:
231
  speed_factor = 1.0
232
  combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
233
-
234
  if combined_line_audio:
235
  timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
236
  max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
 
 
 
 
 
 
 
 
 
 
237
 
 
 
 
 
238
  elif audio_paths:
239
  for path in audio_paths:
240
  if path:
@@ -242,6 +281,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
242
  os.remove(path)
243
  except FileNotFoundError:
244
  pass # Clean up even if no timestamp
 
 
245
 
246
  if not timed_audio_segments:
247
  return None, "No processable audio segments found."
@@ -313,4 +354,4 @@ async def create_demo():
313
 
314
  if __name__ == "__main__":
315
  demo = asyncio.run(create_demo())
316
- demo.launch()
 
12
  import numpy as np
13
  from pydub import AudioSegment
14
  from pydub.playback import play
15
+ import math
16
+ from scipy.signal import butter, sosfiltfilt
17
+
18
+
19
def apply_low_pass_filter(audio_segment, cutoff_freq, order=6):
    """Apply a Butterworth low-pass filter to an AudioSegment.

    Args:
        audio_segment: The pydub AudioSegment to filter.
        cutoff_freq: The cutoff frequency in Hz.
        order: The order of the Butterworth filter (default 6).

    Returns:
        A new AudioSegment containing the filtered audio.

    NOTE(review): samples are filtered as one interleaved array; for
    multi-channel audio this smears values across channels — confirm
    callers only pass mono segments.
    """
    samples = np.array(audio_segment.get_array_of_samples())
    original_dtype = samples.dtype  # integer PCM dtype implied by sample_width

    nyquist_freq = 0.5 * audio_segment.frame_rate
    # butter() requires 0 < Wn < 1; clamp so a cutoff at/above Nyquist
    # degenerates to "pass almost everything" instead of raising.
    normalized_cutoff = min(cutoff_freq / nyquist_freq, 0.999)

    sos = butter(order, normalized_cutoff, btype='low', output='sos')
    # Filter in float64 for numerical stability (sosfiltfilt returns float64).
    filtered = sosfiltfilt(sos, samples.astype(np.float64))

    # BUG FIX: the original called astype(sample_width * 8 // 8), passing a
    # bare int (e.g. 2) to numpy.astype, which raises TypeError. Cast back to
    # the original PCM dtype instead, clipping first to avoid integer
    # wraparound from filter overshoot.
    limits = np.iinfo(original_dtype)
    filtered = np.clip(filtered, limits.min, limits.max).astype(original_dtype)

    # _spawn expects raw sample bytes matching the segment's frame format.
    return audio_segment._spawn(filtered.tobytes())
36
 
37
 
38
  def get_silence(duration_ms=1000):
 
75
  current_rate = rate
76
  current_pitch = pitch
77
  processed_text = text_segment.strip()
78
+ #print(f"Processing this text segment: '{processed_text}'") # Debug
79
  voice_map = {
80
  "1F": "en-GB-SoniaNeural",
81
  "2M": "en-GB-RyanNeural",
 
93
  "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
94
  }
95
  detect = 0
96
+ #iterate throught the voice map to see if a match if found, if found then set the voice
97
  for prefix, voice_short in voice_map.items():
98
  if processed_text.startswith(prefix):
99
  current_voice_short = voice_short
 
105
  detect = 1
106
  processed_text = processed_text[len(prefix):].strip()
107
  break
108
+ #match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
109
+ #example of match: XYZ-45: Group 1: XYZ, Group 2: -45
110
+ match = re.search(r'([A-Za-z]+)([-]?\d*)', processed_text)
111
  if match:
112
  prefix_pitch = match.group(1)
113
  number = int(match.group(2))
114
  if prefix_pitch in voice_map:
115
  current_pitch += number
116
+ #processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
117
+ processed_text = re.sub(r'([A-Za-z]+)([-]?\d*)', '', processed_text, count=1).strip()
118
  elif detect:
119
  processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
120
+
 
121
  if processed_text:
122
  rate_str = f"{current_rate:+d}%"
123
  pitch_str = f"{current_pitch:+d}Hz"
124
+ print(f"Sending to Edge: '{processed_text}'") # Debug
125
  try:
126
  communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
127
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
 
255
  if speed_factor < 1.0:
256
  speed_factor = 1.0
257
  combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
258
+ Rem1='''
259
  if combined_line_audio:
260
  timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
261
  max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
262
+ '''
263
+ if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
264
+ speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
265
+ if speed_factor > 0:
266
+ if speed_factor < 1.0:
267
+ speed_factor = 1.0
268
+ combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
269
+ # Apply low-pass filter AFTER speed adjustment
270
+ cutoff_freq = 7000.0 # Adjust as needed
271
+ combined_line_audio = apply_low_pass_filter(combined_line_audio, cutoff_freq)
272
 
273
+ if combined_line_audio:
274
+ timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
275
+ max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
276
+
277
  elif audio_paths:
278
  for path in audio_paths:
279
  if path:
 
281
  os.remove(path)
282
  except FileNotFoundError:
283
  pass # Clean up even if no timestamp
284
+
285
+
286
 
287
  if not timed_audio_segments:
288
  return None, "No processable audio segments found."
 
354
 
355
  if __name__ == "__main__":
356
  demo = asyncio.run(create_demo())
357
+ demo.launch()