cnph001 committed · Commit 2f93aef · verified · 1 Parent(s): 92f530c

adding silence by marker "SS##"


Try adding silence by marker "SS##"
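
For illustration only (not part of the commit), a minimal sketch of how the new "SS##" marker is meant to read: each quoted paragraph is split on markers such as SS1.5, the markers are kept in the result so the pause length in seconds can be recovered, and the existing voice prefixes (1F, 2M, 1O, ...) still select the speaker.

import re

# Hypothetical input: old-man voice prefix "1O", then a 1.5-second pause marker.
sample = '1OOnce upon a time. SS1.5 The end.'
parts = re.split(r'(SS\d+\.?\d*)', sample)   # same regex the patch adds to app.py
print(parts)                  # ['1OOnce upon a time. ', 'SS1.5', ' The end.']
print(float(parts[1][2:]))    # 1.5 -> seconds of silence to insert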

Files changed (1)
  1. app.py +98 -68
app.py CHANGED
@@ -12,88 +12,117 @@ async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

-# Text-to-speech function for a single paragraph
+# Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
     voice3 ="en-US-BrianMultilingualNeural - en-US (Male)" #good for reading
     voice1F ="en-US-EmmaNeural - en-US (Female)"
     voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
-    voice2F = "en-US-JennyNeural - en-US (Female)"
-    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
+    voice2F = "en-US-JennyNeural - en-US (Female)"
+    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
     voice3F = "en-HK-YanNeural - en-HK (Female)"
     voice4 = "en-GB-MaisieNeural - en-GB (Female)" #Child
     voice5 = "en-GB-RyanNeural - en-GB (Male)" #Old Man
+
     if not text.strip():
-        return None
-    if text.startswith("1F"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice1F.split(" - ")[0]
-    elif text.startswith("2F"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice2F.split(" - ")[0]
-    elif text.startswith("3F"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice3F.split(" - ")[0]
-    elif text.startswith("1M"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice1.split(" - ")[0]
-    elif text.startswith("2M"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice2.split(" - ")[0]
-    elif text.startswith("3M"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice3.split(" - ")[0]
-    elif text.startswith("1C"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice4.split(" - ")[0]
-    elif text.startswith("1O"):
-        text2 = text[2:] # Remove the first two characters ("FF")
-        voice_short_name =voice5.split(" - ")[0]
-        pitch = -30
-        rate = -20
-    else:
-        # Use selected voice, or fallback to default
-        voice_short_name = (voice or default_voice).split(" - ")[0]
-        text2=text
-    rate_str = f"{rate:+d}%"
-    pitch_str = f"{pitch:+d}Hz"
-    communicate = edge_tts.Communicate(text2, voice_short_name, rate=rate_str, pitch=pitch_str)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        tmp_path = tmp_file.name
-        await communicate.save(tmp_path)
-    return tmp_path
-
-# Main text-to-speech function that processes paragraphs
+        return None, [] # Return None for audio path and empty list for silence
+
+    audio_segments = []
+    silence_durations = []
+    parts = re.split(r'(SS\d+\.?\d*)', text)
+
+    for part in parts:
+        if re.match(r'SS\d+\.?\d*', part):
+            try:
+                silence_duration = float(part[2:])
+                silence_durations.append(silence_duration)
+                audio_segments.append(None) # Placeholder for silence
+            except ValueError:
+                print(f"Warning: Invalid silence duration format: {part}")
+        elif part.strip():
+            processed_text = part
+            current_voice = voice
+            current_rate = rate
+            current_pitch = pitch
+
+            if part.startswith("1F"):
+                processed_text = part[2:]
+                current_voice = voice1F.split(" - ")[0]
+            elif part.startswith("2F"):
+                processed_text = part[2:]
+                current_voice = voice2F.split(" - ")[0]
+            elif part.startswith("3F"):
+                processed_text = part[2:]
+                current_voice = voice3F.split(" - ")[0]
+            elif part.startswith("1M"):
+                processed_text = part[2:]
+                current_voice = voice1.split(" - ")[0]
+            elif part.startswith("2M"):
+                processed_text = part[2:]
+                current_voice = voice2.split(" - ")[0]
+            elif part.startswith("3M"):
+                processed_text = part[2:]
+                current_voice = voice3.split(" - ")[0]
+            elif part.startswith("1C"):
+                processed_text = part[2:]
+                current_voice = voice4.split(" - ")[0]
+            elif part.startswith("1O"):
+                processed_text = part[2:]
+                current_voice = voice5.split(" - ")[0]
+                current_pitch = -30
+                current_rate = -20
+
+            rate_str = f"{current_rate:+d}%"
+            pitch_str = f"{current_pitch:+d}Hz"
+            communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+                tmp_path = tmp_file.name
+            await communicate.save(tmp_path)
+            audio_segments.append(tmp_path)
+        else:
+            audio_segments.append(None) # Empty string
+
+    return audio_segments, silence_durations
+
+# Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")

-    # Split by two or more newline characters, optionally preceded by carriage returns
-    #paragraphs = [p for p in re.split(r'\r?\n\r?\n+', text) if p.strip()]
     paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
+    final_audio_segments = []

-    audio_files = []
     for paragraph in paragraphs:
-        audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
-        if audio_path:
-            audio_files.append(audio_path)
-
-    if not audio_files:
-        return None, None # No audio generated
-
-    # Combine audio files if there are multiple paragraphs
-    if len(audio_files) == 1:
-        return audio_files[0], None
-    else:
-        # Simple concatenation for now - consider using a proper audio editing library for smoother transitions
-        combined_audio_path = tempfile.mktemp(suffix=".mp3")
-        with open(combined_audio_path, 'wb') as outfile:
-            for filename in audio_files:
-                with open(filename, 'rb') as infile:
-                    outfile.write(infile.read())
-                os.remove(filename) # Clean up individual files
-        return combined_audio_path, None
+        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
+        if audio_paths:
+            for i, path in enumerate(audio_paths):
+                final_audio_segments.append(path)
+                if i < len(silence_times):
+                    final_audio_segments.append(silence_times[i])
+
+    if not any(isinstance(item, str) for item in final_audio_segments):
+        return None, None # No actual audio generated
+
+    if all(not isinstance(item, str) for item in final_audio_segments):
+        return None, "Only silence markers found."
+
+    combined_audio_path = tempfile.mktemp(suffix=".mp3")
+    with open(combined_audio_path, 'wb') as outfile:
+        for segment in final_audio_segments:
+            if isinstance(segment, str):
+                try:
+                    with open(segment, 'rb') as infile:
+                        outfile.write(infile.read())
+                    os.remove(segment) # Clean up individual files
+                except FileNotFoundError:
+                    print(f"Warning: Audio file not found: {segment}")
+            elif isinstance(segment, (int, float)):
+                # Basic silence insertion (approximate)
+                silence = b'\x00' * int(segment * 44100 * 2) # Assuming 16-bit mono at 44.1kHz
+                outfile.write(silence)
+
+    return combined_audio_path, None

 # Gradio interface function
 @spaces.GPU
@@ -109,6 +138,7 @@ async def create_demo():
     default_voice = "en-US-AndrewNeural - en-US (Male)" # 👈 Pick one of the available voices
     description = """
     Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian, 1C: Childvoice, 1O = OldMan
+    You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
     Enter your text, select a voice, and adjust the speech rate and pitch.
     The application will process your text paragraph by paragraph (separated by two blank lines).
     """
@@ -116,7 +146,7 @@ async def create_demo():
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
-            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines."),
+            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
@@ -125,9 +155,9 @@
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
+        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
         description=description,
-        article="Process text paragraph by paragraph for smoother output.",
+        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,
         allow_flagging=False
     )
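
Note on the silence padding: the commit writes raw zero bytes straight into the combined .mp3, and the code itself marks this as approximate; zero bytes are not valid MP3 frames, so some players may skip the pause or misreport the duration. A hedged alternative sketch, assuming the extra dependencies pydub and ffmpeg (neither is used by this commit): decode the generated clips and render real silence before re-encoding.

from pydub import AudioSegment  # assumed extra dependency; requires ffmpeg on the system

def combine_with_silence(segments, out_path):
    """segments: mp3 file paths (str) interleaved with pause lengths in seconds (int/float)."""
    combined = AudioSegment.empty()
    for seg in segments:
        if isinstance(seg, str):
            combined += AudioSegment.from_mp3(seg)                      # decoded speech clip
        elif isinstance(seg, (int, float)):
            combined += AudioSegment.silent(duration=int(seg * 1000))   # real silence, in ms
    combined.export(out_path, format="mp3")
    return out_path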