NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Files Community

cnph001 committed on May 11

Commit

ed3d5af

verified ·

1 Parent(s): d358db3

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -37

app.py CHANGED Viewed

@@ -103,51 +103,25 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
                 #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
-                Offtext = """
-                if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
-                    speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
-                    #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
-                    if speed_factor > 0:
-                        if speed_factor < 1.0:
-                            speed_factor = 1.0
-                        y, sr = librosa.load(audio_path, sr=None)
-                        y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
-                        sf.write(audio_path, y_stretched, sr)
-                """
                 if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
                     speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
                     if speed_factor > 0:
                         if speed_factor < 1.0:
                             speed_factor = 1.0
-                        # Load the audio file
                         y, sr = librosa.load(audio_path, sr=None)
-                        # Check if audio loading was successful
-                        if y is None or sr is None:
-                            raise ValueError(f"Error loading audio file: {audio_path}")
-                        # Use the phase vocoder for time stretching without pitch change
-                        hop_length = 512  # You can adjust this parameter
-                        phase_vocoder_output = librosa.phase_vocoder(y, rate=speed_factor, hop_length=hop_length)
-                        # Check the shape of the phase vocoder output
-                        if phase_vocoder_output is None or len(phase_vocoder_output) == 0:
-                            raise ValueError("Phase vocoder output is empty or None.")
-                        # Reconstruct the audio signal from the phase vocoder output
-                        try:
-                            # Check if length is properly handled, based on speed_factor
-                            if speed_factor < 1:
-                                y_stretched = librosa.istft(phase_vocoder_output, hop_length=hop_length, length=len(y))
-                            else:
-                                y_stretched = librosa.istft(phase_vocoder_output, hop_length=hop_length)
-                        except Exception as e:
-                            raise ValueError(f"Error during istft: {e}")
-                        # Save the time-stretched audio to the file
-                        sf.write(audio_path, y_stretched, sr)
                 else:
                     print("Generated audio is not longer than target duration, no speed adjustment.") # Debug

                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
                 #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
                 if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
                     speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
+                    #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
                     if speed_factor > 0:
                         if speed_factor < 1.0:
                             speed_factor = 1.0
                         y, sr = librosa.load(audio_path, sr=None)
+                        # Apply phase vocoder to stretch the audio
+                        hop_length = 512  # Hop length, you can experiment with this
+                        y_stretched = librosa.phase_vocoder(y, rate=speed_factor, hop_length=hop_length)
+                        # Reconstruct the audio using ISTFT
+                        y_reconstructed = librosa.istft(y_stretched, hop_length=hop_length)
+                        # Save the stretched audio back to a file
+                        sf.write(audio_path, y_reconstructed, sr)
+                        #y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
+                        #sf.write(audio_path, y_stretched, sr)
                 else:
                     print("Generated audio is not longer than target duration, no speed adjustment.") # Debug