cnph001 committed on
Commit
74db9d2
·
verified ·
1 Parent(s): c919734

Update app.py

Browse files

Add permanent voice change

Files changed (1) hide show
  1. app.py +49 -28
app.py CHANGED
@@ -1,5 +1,6 @@
1
- ##fix overlap, remove silence, leave a tiny bit of silence
2
  ## Simplified
 
3
 
4
  import spaces
5
  import gradio as gr
@@ -62,8 +63,9 @@ async def get_voices():
62
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
63
 
64
  async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
65
- """Generates audio for a text segment, handling voice prefixes, retries, and fallback."""
66
- print(f"Text: {text_segment}") #Debug
 
67
  voice_map = {
68
  "1F": ("en-GB-SoniaNeural", 25, 0),
69
  "2F": ("en-US-JennyNeural", 0, 0),
@@ -73,48 +75,68 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
73
  "2M": ("en-GB-RyanNeural", 0, 0),
74
  "3M": ("en-US-BrianMultilingualNeural", 0, 0),
75
  "4M": ("en-GB-ThomasNeural", 0, 0),
76
- "1O": ("en-GB-RyanNeural", -20, -10),
77
- "1C": ("en-GB-MaisieNeural", 0, 0),
78
  "1V": ("vi-VN-HoaiMyNeural", 0, 0),
79
  "2V": ("vi-VN-NamMinhNeural", 0, 0),
80
  "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
81
  "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
82
  }
83
 
 
84
  current_voice_full = default_voice
85
  current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
86
  current_rate = rate
87
  current_pitch = pitch
88
  processed_text = text_segment.strip()
89
 
90
- detect = False
91
-
92
- prefix = processed_text[:2]
93
- if prefix in voice_map:
94
- current_voice_short, pitch_adj, rate_adj = voice_map[prefix]
95
- current_pitch += pitch_adj
96
- current_rate += rate_adj
97
- detect = True
98
 
99
- match = re.search(r'[A-Za-z]+\-?\d+', processed_text)
100
- if match:
101
- group = match.group()
102
- prefix_only = ''.join(filter(str.isalpha, group))
103
- number = int(''.join(ch for ch in group if ch.isdigit() or ch == '-'))
104
- current_pitch += number
105
- processed_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip()
106
- processed_text = processed_text[len(prefix_only):].strip()
107
- elif detect:
108
- processed_text = processed_text[2:].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- if processed_text:
111
  rate_str = f"{current_rate:+d}%"
112
  pitch_str = f"{current_pitch:+d}Hz"
113
 
114
- # Retry logic
115
  for attempt in range(3):
116
  try:
117
- communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
118
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
119
  audio_path = tmp_file.name
120
  await communicate.save(audio_path)
@@ -126,14 +148,13 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
126
  audio.export(stripped_path, format="mp3")
127
  return stripped_path
128
  except Exception as e:
129
- print(f"Edge TTS Failed# {attempt}:: {e}") #Debug
130
  if attempt == 2:
131
  # Final failure: return 500ms of silence
132
  silent_audio = AudioSegment.silent(duration=500)
133
  fallback_path = tempfile.mktemp(suffix=".mp3")
134
  silent_audio.export(fallback_path, format="mp3")
135
  return fallback_path
136
- await asyncio.sleep(0.5) # brief wait before retry
137
 
138
  return None
139
 
 
1
+ ## fix overlap, remove silence, leave a tiny bit of silence
2
  ## Simplified
3
+ ## Permanent voice change implemented
4
 
5
  import spaces
6
  import gradio as gr
 
63
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
64
 
65
  async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
66
+ """Generates audio for a text segment, handling permanent and temporary voice changes with new rules."""
67
+
68
+ # Define the voice map for reference
69
  voice_map = {
70
  "1F": ("en-GB-SoniaNeural", 25, 0),
71
  "2F": ("en-US-JennyNeural", 0, 0),
 
75
  "2M": ("en-GB-RyanNeural", 0, 0),
76
  "3M": ("en-US-BrianMultilingualNeural", 0, 0),
77
  "4M": ("en-GB-ThomasNeural", 0, 0),
78
+ "1O": ("en-GB-RyanNeural", -20, -10), # Old man
79
+ "1C": ("en-GB-MaisieNeural", 0, 0), # Child
80
  "1V": ("vi-VN-HoaiMyNeural", 0, 0),
81
  "2V": ("vi-VN-NamMinhNeural", 0, 0),
82
  "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
83
  "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
84
  }
85
 
86
+ # Initialize current voice and processing variables
87
  current_voice_full = default_voice
88
  current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
89
  current_rate = rate
90
  current_pitch = pitch
91
  processed_text = text_segment.strip()
92
 
93
+ # Track permanent voice and temporary changes
94
+ permanent_voice = current_voice_short
95
+ temp_voice = None
 
 
 
 
 
96
 
97
+ # We'll process the text and adjust voices accordingly
98
+ result = []
99
+ idx = 0
100
+ while idx < len(processed_text):
101
+ # Detect potential voice change
102
+ match = re.match(r"(1F|2F|3F|4F|1M|2M|3M|4M|1O|1C|1V|2V|3V|4V)(P?)(-?\d+)?", processed_text[idx:])
103
+
104
+ if match:
105
+ prefix = match.group(1)
106
+ permanent_flag = match.group(2) == 'P' # Check if it's a permanent change
107
+ pitch_modifier = match.group(3) # This will be None or a number
108
+
109
+ if permanent_flag:
110
+ # Permanent voice change (e.g., "4VP")
111
+ permanent_voice, pitch_adj, rate_adj = voice_map[prefix]
112
+ current_pitch += pitch_adj
113
+ current_rate += rate_adj
114
+ result.append(f"<perm>{prefix}P") # Mark as permanent change
115
+ elif pitch_modifier:
116
+ # Temporary pitch adjustment (e.g., "4V-10" or "4V+5")
117
+ pitch_adjustment = int(pitch_modifier)
118
+ current_pitch += pitch_adjustment
119
+ result.append(f"<temp>{prefix}{pitch_modifier}") # Mark as temporary change
120
+
121
+ # Move index forward past the match
122
+ idx += len(match.group(0))
123
+ continue
124
+
125
+ # If no match, just add the normal text character
126
+ result.append(processed_text[idx])
127
+ idx += 1
128
+
129
+ # Rebuild the text with permanent and temporary voice marks
130
+ final_processed_text = ''.join(result).strip()
131
 
132
+ if final_processed_text:
133
  rate_str = f"{current_rate:+d}%"
134
  pitch_str = f"{current_pitch:+d}Hz"
135
 
136
+ # Retry logic for TTS
137
  for attempt in range(3):
138
  try:
139
+ communicate = edge_tts.Communicate(final_processed_text, permanent_voice, rate=rate_str, pitch=pitch_str)
140
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
141
  audio_path = tmp_file.name
142
  await communicate.save(audio_path)
 
148
  audio.export(stripped_path, format="mp3")
149
  return stripped_path
150
  except Exception as e:
 
151
  if attempt == 2:
152
  # Final failure: return 500ms of silence
153
  silent_audio = AudioSegment.silent(duration=500)
154
  fallback_path = tempfile.mktemp(suffix=".mp3")
155
  silent_audio.export(fallback_path, format="mp3")
156
  return fallback_path
157
+ await asyncio.sleep(0.5) # Retry after brief pause
158
 
159
  return None
160