cnph001 committed on
Commit cdec9da · verified · 1 Parent(s): f18fa5d

Update app.py

Files changed (1)
  1. app.py +39 -56
app.py CHANGED
@@ -1,6 +1,5 @@
1
- ## fix overlap, remove silence, leave a tiny bit of silence
2
  ## Simplified
3
- ## Permanent voice change implemented
4
 
5
  import spaces
6
  import gradio as gr
@@ -13,6 +12,9 @@ from pathlib import Path
13
  from pydub.silence import detect_nonsilent
14
  from pydub import AudioSegment
15
 
16
  def strip_silence(audio: AudioSegment, silence_thresh=-40, min_silence_len=100, silence_padding_ms=100):
17
  from pydub.silence import detect_nonsilent
18
  # Detect non-silent regions
@@ -62,12 +64,9 @@ async def get_voices():
62
  voices = await edge_tts.list_voices()
63
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
64
 
65
-
66
- ## EDIT
67
  async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
68
- """Generates audio for a text segment, handling permanent and temporary voice changes."""
69
-
70
- # Define the voice map for reference
71
  voice_map = {
72
  "1F": ("en-GB-SoniaNeural", 25, 0),
73
  "2F": ("en-US-JennyNeural", 0, 0),
@@ -77,69 +76,52 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
77
  "2M": ("en-GB-RyanNeural", 0, 0),
78
  "3M": ("en-US-BrianMultilingualNeural", 0, 0),
79
  "4M": ("en-GB-ThomasNeural", 0, 0),
80
- "1O": ("en-GB-RyanNeural", -20, -10), # Old man
81
- "1C": ("en-GB-MaisieNeural", 0, 0), # Child
82
  "1V": ("vi-VN-HoaiMyNeural", 0, 0),
83
  "2V": ("vi-VN-NamMinhNeural", 0, 0),
84
  "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
85
  "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
86
  }
87
-
88
- # Initialize current voice and processing variables
89
- current_voice_full = default_voice
90
- current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
 
91
  current_rate = rate
92
  current_pitch = pitch
93
  processed_text = text_segment.strip()
94
 
95
- # Track permanent voice and temporary changes
96
- permanent_voice = current_voice_short
97
- temp_voice = None
98
 
99
- # We'll process the text and adjust voices accordingly
100
- result = []
101
- idx = 0
102
- while idx < len(processed_text):
103
- # Detect potential voice change
104
- match = re.match(r"(1F|2F|3F|4F|1M|2M|3M|4M|1O|1C|1V|2V|3V|4V)(P?)(-?\d+)?", processed_text[idx:])
105
-
106
- if match:
107
- prefix = match.group(1)
108
- permanent_flag = match.group(2) == 'P' # Check if it's a permanent change
109
- pitch_modifier = match.group(3) # This will be None or a number
110
-
111
- if permanent_flag:
112
- # Permanent voice change (e.g., "4VP")
113
- permanent_voice, pitch_adj, rate_adj = voice_map[prefix]
114
- current_pitch += pitch_adj
115
- current_rate += rate_adj
116
- result.append(f"<perm>{prefix}P") # Mark as permanent change
117
- temp_voice = None # Clear temporary voice changes
118
- elif pitch_modifier:
119
- # Temporary pitch adjustment (e.g., "4V-10" or "4V+5")
120
- pitch_adjustment = int(pitch_modifier)
121
- current_pitch += pitch_adjustment
122
- result.append(f"<temp>{prefix}{pitch_modifier}") # Mark as temporary change
123
-
124
- # Move index forward past the match
125
- idx += len(match.group(0))
126
- continue
127
-
128
- # If no match, just add the normal text character
129
- result.append(processed_text[idx])
130
- idx += 1
131
-
132
- # Rebuild the text with permanent and temporary voice marks
133
- final_processed_text = ''.join(result).strip()
134
 
135
- if final_processed_text:
136
  rate_str = f"{current_rate:+d}%"
137
  pitch_str = f"{current_pitch:+d}Hz"
138
 
139
- # Retry logic for TTS
140
  for attempt in range(3):
141
  try:
142
- communicate = edge_tts.Communicate(final_processed_text, permanent_voice, rate=rate_str, pitch=pitch_str)
143
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
144
  audio_path = tmp_file.name
145
  await communicate.save(audio_path)
@@ -151,16 +133,16 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
151
  audio.export(stripped_path, format="mp3")
152
  return stripped_path
153
  except Exception as e:
 
154
  if attempt == 2:
155
  # Final failure: return 500ms of silence
156
  silent_audio = AudioSegment.silent(duration=500)
157
  fallback_path = tempfile.mktemp(suffix=".mp3")
158
  silent_audio.export(fallback_path, format="mp3")
159
  return fallback_path
160
- await asyncio.sleep(0.5) # Retry after brief pause
161
 
162
  return None
163
- ### END EDIT
164
 
165
  async def process_transcript_line(line, default_voice, rate, pitch):
166
  """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
@@ -318,6 +300,7 @@ async def create_demo():
318
  2V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
319
  3V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
320
  4V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
 
321
  ****************************************************************************************************
322
  """
323
  demo = gr.Interface(
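
For comparison with the new version below, a minimal standalone sketch of how the removed code classified speaker markers. The regex is copied from the deleted lines above; the helper name classify and the sample segments are hypothetical.

import re

# Marker regex from the removed code: speaker prefix, optional "P" for a permanent
# switch, optional signed pitch offset in Hz.
MARKER_RE = re.compile(r"(1F|2F|3F|4F|1M|2M|3M|4M|1O|1C|1V|2V|3V|4V)(P?)(-?\d+)?")

def classify(segment: str) -> str:
    """Describe how the old parser treated the start of a text segment."""
    m = MARKER_RE.match(segment)
    if not m:
        return "plain text, spoken with the current voice"
    prefix, permanent_flag, pitch_offset = m.groups()
    if permanent_flag == "P":
        return f"switch to voice {prefix} permanently"
    if pitch_offset:
        return f"voice {prefix} with a temporary pitch offset of {int(pitch_offset):+d} Hz"
    return f"voice {prefix} for this segment only"

for sample in ("4VP Xin chào", "4V-10 softly", "2M Hello there", "no marker at all"):
    print(f"{sample!r:20} -> {classify(sample)}")
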
 
1
+ ##fix overlap, remove silence, leave a tiny bit of silence
2
  ## Simplified
 
3
 
4
  import spaces
5
  import gradio as gr
 
12
  from pydub.silence import detect_nonsilent
13
  from pydub import AudioSegment
14
 
15
+ flagpermanent = False
16
+ default_voice_short= ""
17
+
18
  def strip_silence(audio: AudioSegment, silence_thresh=-40, min_silence_len=100, silence_padding_ms=100):
19
  from pydub.silence import detect_nonsilent
20
  # Detect non-silent regions
 
64
  voices = await edge_tts.list_voices()
65
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
66
 
 
 
67
  async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
68
+ """Generates audio for a text segment, handling voice prefixes, retries, and fallback."""
69
+ print(f"Text: {text_segment}") #Debug
 
70
  voice_map = {
71
  "1F": ("en-GB-SoniaNeural", 25, 0),
72
  "2F": ("en-US-JennyNeural", 0, 0),
 
76
  "2M": ("en-GB-RyanNeural", 0, 0),
77
  "3M": ("en-US-BrianMultilingualNeural", 0, 0),
78
  "4M": ("en-GB-ThomasNeural", 0, 0),
79
+ "1O": ("en-GB-RyanNeural", -20, -10),
80
+ "1C": ("en-GB-MaisieNeural", 0, 0),
81
  "1V": ("vi-VN-HoaiMyNeural", 0, 0),
82
  "2V": ("vi-VN-NamMinhNeural", 0, 0),
83
  "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
84
  "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
85
  }
86
+ if default_voice_short == "":
87
+ current_voice_full = default_voice
88
+ current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
89
+ else:
90
+ current_voice_short = default_voice_short
91
  current_rate = rate
92
  current_pitch = pitch
93
  processed_text = text_segment.strip()
94
 
95
+ detect = False
 
 
96
 
97
+ prefix = processed_text[:2]
98
+ if prefix in voice_map:
99
+ current_voice_short, pitch_adj, rate_adj = voice_map[prefix]
100
+ current_pitch += pitch_adj
101
+ current_rate += rate_adj
102
+ detect = True
103
+
104
+ match = re.search(r'[A-Za-z]+\-?\d+', processed_text)
105
+ if match:
106
+ group = match.group()
107
+ prefix_only = ''.join(filter(str.isalpha, group))
108
+ number = int(''.join(ch for ch in group if ch.isdigit() or ch == '-'))
109
+ if number == 0:
110
+ default_voice_short = current_voice_short
111
+ current_pitch += number
112
+ processed_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip()
113
+ processed_text = processed_text[len(prefix_only):].strip()
114
+ elif detect:
115
+ processed_text = processed_text[2:].strip()
116
 
117
+ if processed_text:
118
  rate_str = f"{current_rate:+d}%"
119
  pitch_str = f"{current_pitch:+d}Hz"
120
 
121
+ # Retry logic
122
  for attempt in range(3):
123
  try:
124
+ communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
125
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
126
  audio_path = tmp_file.name
127
  await communicate.save(audio_path)
 
133
  audio.export(stripped_path, format="mp3")
134
  return stripped_path
135
  except Exception as e:
136
+ print(f"Edge TTS Failed# {attempt}:: {e}") #Debug
137
  if attempt == 2:
138
  # Final failure: return 500ms of silence
139
  silent_audio = AudioSegment.silent(duration=500)
140
  fallback_path = tempfile.mktemp(suffix=".mp3")
141
  silent_audio.export(fallback_path, format="mp3")
142
  return fallback_path
143
+ await asyncio.sleep(0.5) # brief wait before retry
144
 
145
  return None
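
To summarize the new control syntax for review: a two-character prefix picks a voice from voice_map, a signed number right after it shifts pitch in Hz for that segment, and 0 pins the chosen voice as the new default via module-level state (default_voice_short). Below is a minimal, self-contained sketch of those rules; the reduced voice map, the helper name parse_segment, and the sample segments are hypothetical, and the matching is simplified compared to the regex in the diff.

import re

# Reduced, illustrative subset of voice_map: prefix -> (voice, pitch adj in Hz, rate adj in %).
VOICE_MAP = {
    "1F": ("en-GB-SoniaNeural", 25, 0),
    "2M": ("en-GB-RyanNeural", 0, 0),
    "1V": ("vi-VN-HoaiMyNeural", 0, 0),
}

_pinned_voice = ""  # plays the role of default_voice_short in the diff

def parse_segment(segment, default_voice="en-US-JennyNeural", rate=0, pitch=0):
    """Return (voice, rate, pitch, text) for one quoted segment under the new rules."""
    global _pinned_voice
    voice = _pinned_voice or default_voice
    text = segment.strip()

    prefix = text[:2]
    if prefix in VOICE_MAP:
        voice, pitch_adj, rate_adj = VOICE_MAP[prefix]
        pitch += pitch_adj
        rate += rate_adj
        text = text[2:].strip()

        # Optional number right after the prefix: 0 pins this voice for later
        # segments, any other signed value is a one-off pitch offset in Hz.
        m = re.match(r"-?\d+", text)
        if m:
            number = int(m.group())
            if number == 0:
                _pinned_voice = voice
            else:
                pitch += number
            text = text[m.end():].strip()

    return voice, rate, pitch, text

# Hypothetical segments, e.g. from a transcript line such as: 00:00:05.120 "1V0 Xin chào" "2M-10 Hello there"
for seg in ("1V0 Xin chào", "2M-10 Hello there", "and back to the pinned voice"):
    print(parse_segment(seg))
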
 
146
 
147
  async def process_transcript_line(line, default_voice, rate, pitch):
148
  """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
 
300
  2V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
301
  3V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
302
  4V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
303
+ Add 0 after a voice prefix (e.g. 1V0) to make that voice the new default permanently
304
  ****************************************************************************************************
305
  """
306
  demo = gr.Interface(
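
The silence handling the commit title refers to ("remove silence, leave a tiny bit of silence") and the 500 ms silent fallback in the retry loop come down to a small amount of pydub. strip_silence is only partially visible in this diff, so the following is an illustrative sketch under the same defaults (silence_thresh=-40 dBFS, min_silence_len=100 ms, silence_padding_ms=100 ms) with hypothetical helper names, not the repository's exact implementation.

from pydub import AudioSegment
from pydub.silence import detect_nonsilent

def trim_with_padding(audio: AudioSegment, silence_thresh=-40, min_silence_len=100,
                      silence_padding_ms=100) -> AudioSegment:
    """Drop leading/trailing silence but keep a small cushion so clips don't butt together."""
    nonsilent = detect_nonsilent(audio, min_silence_len=min_silence_len,
                                 silence_thresh=silence_thresh)
    if not nonsilent:
        # Entirely silent input: keep only the cushion.
        return AudioSegment.silent(duration=silence_padding_ms)
    start = max(0, nonsilent[0][0] - silence_padding_ms)          # milliseconds
    end = min(len(audio), nonsilent[-1][1] + silence_padding_ms)  # milliseconds
    return audio[start:end]

def fallback_clip(duration_ms=500) -> AudioSegment:
    """What the TTS retry loop returns after three failed attempts."""
    return AudioSegment.silent(duration=duration_ms)

Exporting either segment (segment.export(path, format="mp3")) requires ffmpeg on the PATH, which the Space's environment is assumed to provide.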