Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,27 @@ import soundfile as sf
|
|
12 |
import numpy as np
|
13 |
from pydub import AudioSegment
|
14 |
from pydub.playback import play
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
def get_silence(duration_ms=1000):
|
@@ -54,7 +75,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
|
|
54 |
current_rate = rate
|
55 |
current_pitch = pitch
|
56 |
processed_text = text_segment.strip()
|
57 |
-
print(f"Processing this text segment: '{processed_text}'") # Debug
|
58 |
voice_map = {
|
59 |
"1F": "en-GB-SoniaNeural",
|
60 |
"2M": "en-GB-RyanNeural",
|
@@ -72,6 +93,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
|
|
72 |
"4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
|
73 |
}
|
74 |
detect = 0
|
|
|
75 |
for prefix, voice_short in voice_map.items():
|
76 |
if processed_text.startswith(prefix):
|
77 |
current_voice_short = voice_short
|
@@ -83,20 +105,23 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
|
|
83 |
detect = 1
|
84 |
processed_text = processed_text[len(prefix):].strip()
|
85 |
break
|
86 |
-
match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
|
|
|
|
|
87 |
if match:
|
88 |
prefix_pitch = match.group(1)
|
89 |
number = int(match.group(2))
|
90 |
if prefix_pitch in voice_map:
|
91 |
current_pitch += number
|
92 |
-
processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
|
|
|
93 |
elif detect:
|
94 |
processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
|
95 |
-
|
96 |
-
processed_text = processed_text[2:].strip()
|
97 |
if processed_text:
|
98 |
rate_str = f"{current_rate:+d}%"
|
99 |
pitch_str = f"{current_pitch:+d}Hz"
|
|
|
100 |
try:
|
101 |
communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
|
102 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
|
@@ -230,11 +255,25 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
|
|
230 |
if speed_factor < 1.0:
|
231 |
speed_factor = 1.0
|
232 |
combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
|
233 |
-
|
234 |
if combined_line_audio:
|
235 |
timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
|
236 |
max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
|
|
|
|
|
|
|
|
|
238 |
elif audio_paths:
|
239 |
for path in audio_paths:
|
240 |
if path:
|
@@ -242,6 +281,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
|
|
242 |
os.remove(path)
|
243 |
except FileNotFoundError:
|
244 |
pass # Clean up even if no timestamp
|
|
|
|
|
245 |
|
246 |
if not timed_audio_segments:
|
247 |
return None, "No processable audio segments found."
|
@@ -313,4 +354,4 @@ async def create_demo():
|
|
313 |
|
314 |
if __name__ == "__main__":
|
315 |
demo = asyncio.run(create_demo())
|
316 |
-
demo.launch()
|
|
|
12 |
import numpy as np
|
13 |
from pydub import AudioSegment
|
14 |
from pydub.playback import play
|
15 |
+
import math
|
16 |
+
from scipy.signal import butter, sosfiltfilt
|
17 |
+
|
18 |
+
|
19 |
+
def apply_low_pass_filter(audio_segment, cutoff_freq, order=6):
    """Apply a zero-phase Butterworth low-pass filter to an AudioSegment.

    Args:
        audio_segment: The AudioSegment to filter.
        cutoff_freq: The cutoff frequency in Hz.
        order: The order of the Butterworth filter.

    Returns:
        A new AudioSegment with the filtered audio. The input segment is
        returned unchanged when it contains no samples or when
        ``cutoff_freq`` is at or above the Nyquist frequency (there is
        nothing to remove in that case).

    Raises:
        ValueError: If the segment's sample width has no matching integer
            dtype, or if ``cutoff_freq`` is not positive (raised by
            ``scipy.signal.butter``).
    """
    # Integer dtype per PCM sample width in bytes.
    # NOTE(review): assumes pydub stores signed samples for every width
    # (array type codes 'b', 'h', 'i') -- confirm for 8-bit audio.
    pcm_dtypes = {1: np.int8, 2: np.int16, 4: np.int32}
    sample_dtype = pcm_dtypes.get(audio_segment.sample_width)
    if sample_dtype is None:
        raise ValueError(
            f"Unsupported sample width: {audio_segment.sample_width} bytes"
        )

    nyquist_freq = 0.5 * audio_segment.frame_rate
    normalized_cutoff = cutoff_freq / nyquist_freq
    if normalized_cutoff >= 1.0:
        # butter() rejects Wn >= 1; a cutoff at/above Nyquist removes nothing.
        return audio_segment

    samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float64)
    if samples.size == 0:
        return audio_segment

    # Samples are interleaved per frame; give each channel its own column so
    # the filter never mixes neighbouring channels together.
    samples = samples.reshape(-1, audio_segment.channels)

    sos = butter(order, normalized_cutoff, btype='low', output='sos')
    try:
        filtered = sosfiltfilt(sos, samples, axis=0)
    except ValueError:
        # The segment is shorter than the default edge padding; retry with
        # the largest padding that still fits.
        filtered = sosfiltfilt(
            sos, samples, axis=0, padlen=samples.shape[0] - 1
        )

    # Round and clip before casting so filter overshoot saturates instead of
    # wrapping around the integer range.
    limits = np.iinfo(sample_dtype)
    pcm = np.clip(np.rint(filtered), limits.min, limits.max).astype(sample_dtype)

    # _spawn expects raw sample data; hand it the interleaved PCM bytes.
    return audio_segment._spawn(pcm.tobytes())
|
36 |
|
37 |
|
38 |
def get_silence(duration_ms=1000):
|
|
|
75 |
current_rate = rate
|
76 |
current_pitch = pitch
|
77 |
processed_text = text_segment.strip()
|
78 |
+
#print(f"Processing this text segment: '{processed_text}'") # Debug
|
79 |
voice_map = {
|
80 |
"1F": "en-GB-SoniaNeural",
|
81 |
"2M": "en-GB-RyanNeural",
|
|
|
93 |
"4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
|
94 |
}
|
95 |
detect = 0
|
96 |
+
# Iterate through the voice map to see if a match is found; if found, set the voice
|
97 |
for prefix, voice_short in voice_map.items():
|
98 |
if processed_text.startswith(prefix):
|
99 |
current_voice_short = voice_short
|
|
|
105 |
detect = 1
|
106 |
processed_text = processed_text[len(prefix):].strip()
|
107 |
break
|
108 |
+
#match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
|
109 |
+
#example of match: XYZ-45: Group 1: XYZ, Group 2: -45
|
110 |
+
match = re.search(r'([A-Za-z]+)([-]?\d*)', processed_text)
|
111 |
if match:
|
112 |
prefix_pitch = match.group(1)
|
113 |
number = int(match.group(2))
|
114 |
if prefix_pitch in voice_map:
|
115 |
current_pitch += number
|
116 |
+
#processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
|
117 |
+
processed_text = re.sub(r'([A-Za-z]+)([-]?\d*)', '', processed_text, count=1).strip()
|
118 |
elif detect:
|
119 |
processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
|
120 |
+
|
|
|
121 |
if processed_text:
|
122 |
rate_str = f"{current_rate:+d}%"
|
123 |
pitch_str = f"{current_pitch:+d}Hz"
|
124 |
+
print(f"Sending to Edge: '{processed_text}'") # Debug
|
125 |
try:
|
126 |
communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
|
127 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
|
|
|
255 |
if speed_factor < 1.0:
|
256 |
speed_factor = 1.0
|
257 |
combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
|
258 |
+
Rem1='''
|
259 |
if combined_line_audio:
|
260 |
timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
|
261 |
max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
|
262 |
+
'''
|
263 |
+
if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
|
264 |
+
speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
|
265 |
+
if speed_factor > 0:
|
266 |
+
if speed_factor < 1.0:
|
267 |
+
speed_factor = 1.0
|
268 |
+
combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
|
269 |
+
# Apply low-pass filter AFTER speed adjustment
|
270 |
+
cutoff_freq = 7000.0 # Adjust as needed
|
271 |
+
combined_line_audio = apply_low_pass_filter(combined_line_audio, cutoff_freq)
|
272 |
|
273 |
+
if combined_line_audio:
|
274 |
+
timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
|
275 |
+
max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
|
276 |
+
|
277 |
elif audio_paths:
|
278 |
for path in audio_paths:
|
279 |
if path:
|
|
|
281 |
os.remove(path)
|
282 |
except FileNotFoundError:
|
283 |
pass # Clean up even if no timestamp
|
284 |
+
|
285 |
+
|
286 |
|
287 |
if not timed_audio_segments:
|
288 |
return None, "No processable audio segments found."
|
|
|
354 |
|
355 |
if __name__ == "__main__":
|
356 |
demo = asyncio.run(create_demo())
|
357 |
+
demo.launch()
|