cnph001 committed on
Commit
3330c34
·
verified ·
1 Parent(s): 218e261

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -7
app.py CHANGED
@@ -12,6 +12,27 @@ import soundfile as sf
12
  import numpy as np
13
  from pydub import AudioSegment
14
  from pydub.playback import play
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  def get_silence(duration_ms=1000):
@@ -54,7 +75,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
54
  current_rate = rate
55
  current_pitch = pitch
56
  processed_text = text_segment.strip()
57
- print(f"Processing this text segment: '{processed_text}'") # Debug
58
  voice_map = {
59
  "1F": "en-GB-SoniaNeural",
60
  "2M": "en-GB-RyanNeural",
@@ -72,6 +93,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
72
  "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
73
  }
74
  detect = 0
 
75
  for prefix, voice_short in voice_map.items():
76
  if processed_text.startswith(prefix):
77
  current_voice_short = voice_short
@@ -83,20 +105,23 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
83
  detect = 1
84
  processed_text = processed_text[len(prefix):].strip()
85
  break
86
- match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
 
 
87
  if match:
88
  prefix_pitch = match.group(1)
89
  number = int(match.group(2))
90
  if prefix_pitch in voice_map:
91
  current_pitch += number
92
- processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
 
93
  elif detect:
94
  processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
95
- elif detect:
96
- processed_text = processed_text[2:].strip()
97
  if processed_text:
98
  rate_str = f"{current_rate:+d}%"
99
  pitch_str = f"{current_pitch:+d}Hz"
 
100
  try:
101
  communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
102
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
@@ -230,11 +255,25 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
230
  if speed_factor < 1.0:
231
  speed_factor = 1.0
232
  combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
233
-
234
  if combined_line_audio:
235
  timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
236
  max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
 
 
 
 
 
 
 
 
 
 
237
 
 
 
 
 
238
  elif audio_paths:
239
  for path in audio_paths:
240
  if path:
@@ -242,6 +281,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
242
  os.remove(path)
243
  except FileNotFoundError:
244
  pass # Clean up even if no timestamp
 
 
245
 
246
  if not timed_audio_segments:
247
  return None, "No processable audio segments found."
@@ -313,4 +354,4 @@ async def create_demo():
313
 
314
  if __name__ == "__main__":
315
  demo = asyncio.run(create_demo())
316
- demo.launch()
 
12
  import numpy as np
13
  from pydub import AudioSegment
14
  from pydub.playback import play
15
+ import math
16
+ from scipy.signal import butter, sosfiltfilt
17
+
18
+
19
def apply_low_pass_filter(audio_segment, cutoff_freq, order=6):
    """Apply a Butterworth low-pass filter to an AudioSegment.

    Args:
        audio_segment: The pydub AudioSegment to filter.
        cutoff_freq: The cutoff frequency in Hz.
        order: The order of the Butterworth filter (default 6).

    Returns:
        A new AudioSegment containing the filtered audio.

    NOTE(review): samples are filtered as one interleaved array; for
    multi-channel audio this smears values across channels — confirm
    callers only pass mono segments.
    """
    samples = np.array(audio_segment.get_array_of_samples())
    original_dtype = samples.dtype  # integer PCM dtype implied by sample_width

    nyquist_freq = 0.5 * audio_segment.frame_rate
    # butter() requires 0 < Wn < 1; clamp so a cutoff at/above Nyquist
    # degenerates to "pass almost everything" instead of raising.
    normalized_cutoff = min(cutoff_freq / nyquist_freq, 0.999)

    sos = butter(order, normalized_cutoff, btype='low', output='sos')
    # Filter in float64 for numerical stability (sosfiltfilt returns float64).
    filtered = sosfiltfilt(sos, samples.astype(np.float64))

    # BUG FIX: the original called astype(sample_width * 8 // 8), passing a
    # bare int (e.g. 2) to numpy.astype, which raises TypeError. Cast back to
    # the original PCM dtype instead, clipping first to avoid integer
    # wraparound from filter overshoot.
    limits = np.iinfo(original_dtype)
    filtered = np.clip(filtered, limits.min, limits.max).astype(original_dtype)

    # _spawn expects raw sample bytes matching the segment's frame format.
    return audio_segment._spawn(filtered.tobytes())
36
 
37
 
38
  def get_silence(duration_ms=1000):
 
75
  current_rate = rate
76
  current_pitch = pitch
77
  processed_text = text_segment.strip()
78
+ #print(f"Processing this text segment: '{processed_text}'") # Debug
79
  voice_map = {
80
  "1F": "en-GB-SoniaNeural",
81
  "2M": "en-GB-RyanNeural",
 
93
  "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
94
  }
95
  detect = 0
96
+ #iterate throught the voice map to see if a match if found, if found then set the voice
97
  for prefix, voice_short in voice_map.items():
98
  if processed_text.startswith(prefix):
99
  current_voice_short = voice_short
 
105
  detect = 1
106
  processed_text = processed_text[len(prefix):].strip()
107
  break
108
+ #match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
109
+ #example of match: XYZ-45: Group 1: XYZ, Group 2: -45
110
+ match = re.search(r'([A-Za-z]+)([-]?\d*)', processed_text)
111
  if match:
112
  prefix_pitch = match.group(1)
113
  number = int(match.group(2))
114
  if prefix_pitch in voice_map:
115
  current_pitch += number
116
+ #processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
117
+ processed_text = re.sub(r'([A-Za-z]+)([-]?\d*)', '', processed_text, count=1).strip()
118
  elif detect:
119
  processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
120
+
 
121
  if processed_text:
122
  rate_str = f"{current_rate:+d}%"
123
  pitch_str = f"{current_pitch:+d}Hz"
124
+ print(f"Sending to Edge: '{processed_text}'") # Debug
125
  try:
126
  communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
127
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
 
255
  if speed_factor < 1.0:
256
  speed_factor = 1.0
257
  combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
258
+ Rem1='''
259
  if combined_line_audio:
260
  timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
261
  max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
262
+ '''
263
+ if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
264
+ speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
265
+ if speed_factor > 0:
266
+ if speed_factor < 1.0:
267
+ speed_factor = 1.0
268
+ combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
269
+ # Apply low-pass filter AFTER speed adjustment
270
+ cutoff_freq = 7000.0 # Adjust as needed
271
+ combined_line_audio = apply_low_pass_filter(combined_line_audio, cutoff_freq)
272
 
273
+ if combined_line_audio:
274
+ timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
275
+ max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
276
+
277
  elif audio_paths:
278
  for path in audio_paths:
279
  if path:
 
281
  os.remove(path)
282
  except FileNotFoundError:
283
  pass # Clean up even if no timestamp
284
+
285
+
286
 
287
  if not timed_audio_segments:
288
  return None, "No processable audio segments found."
 
354
 
355
  if __name__ == "__main__":
356
  demo = asyncio.run(create_demo())
357
+ demo.launch()