cnph001 committed on
Commit
5021a0c
·
verified ·
1 Parent(s): 4337b98

Restored to previous working version

Browse files

Restored to previous working version
Changed to detect quotes instead of paragraphs

Files changed (1) hide show
  1. app.py +92 -107
app.py CHANGED
@@ -1,145 +1,130 @@
 
 
 
1
  import asyncio
2
- import os
3
- import re
4
  import tempfile
5
- import edge_tts
6
- import gradio as gr
7
-
8
- # Default voice
9
- default_voice = "en-US-AndrewNeural - en-US (Male)"
10
 
11
- # Text-to-speech function for a single segment
12
async def process_speech_segment(text, voice, rate, pitch):
    """
    Processes a single segment of text (either a quote or regular text)
    and generates speech using edge-tts.

    A two-character tag at the start of the segment selects a fixed voice
    and is removed before synthesis:
    1F/2F/3F -> Emma/Jenny/Yan, 1M/2M -> Brian/William, 1C -> Maisie.
    Without a tag the ``voice`` argument is used.

    Args:
        text (str): The text to be converted to speech.
        voice (str): The voice label "<ShortName> - <Locale> (<Gender>)";
            only the part before " - " is handed to edge-tts.
        rate (int): The speech rate, formatted as a signed percentage.
        pitch (int): The speech pitch, formatted as a signed Hz offset.

    Returns:
        str: The path to the generated audio file, or None when there is
        nothing to speak or synthesis failed.
    """
    tagged_voices = {
        "1F": "en-US-EmmaNeural - en-US (Female)",
        "2F": "en-US-JennyNeural - en-US (Female)",
        "3F": "en-HK-YanNeural - en-HK (Female)",
        "1M": "en-US-BrianNeural - en-US (Male)",
        "2M": "en-AU-WilliamNeural - en-AU (Male)",
        "1C": "en-GB-MaisieNeural - en-GB (Female)",  # Child
    }

    if not text.strip():
        return None

    tag = text[:2]
    if tag in tagged_voices:
        voice_label = tagged_voices[tag]
        spoken_text = text[2:].strip()
        # A segment that held only the tag has nothing to synthesize.
        if not spoken_text:
            return None
    else:
        voice_label = voice
        spoken_text = text
    voice_short_name = voice_label.split(" - ")[0]

    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    tmp_path = None
    try:
        communicate = edge_tts.Communicate(spoken_text, voice=voice_short_name, rate=rate_str, pitch=pitch_str)
        # Reserve a file name, then close the handle before edge-tts writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            tmp_path = tmp_file.name
        await communicate.save(tmp_path)
        return tmp_path
    except Exception as e:
        print(f"Error processing segment: {e}")  # Log the error
        # delete=False means nobody else removes the file after a failed save.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
        return None
71
 
72
- # Main text-to-speech function
73
def _split_quoted_segments(text):
    """Split text into quoted spans (quotes removed) and the text between them.

    An opening quote with no closing partner is kept, quote character
    included, as one trailing segment.
    """
    segments = []
    pos = 0
    while pos < len(text):
        if text[pos] == '"':
            closing = text.find('"', pos + 1)
            if closing == -1:
                segments.append(text[pos:])  # Handle unclosed quote
                break
            segments.append(text[pos + 1:closing])
            pos = closing + 1
        else:
            next_quote = text.find('"', pos)
            end = len(text) if next_quote == -1 else next_quote
            segments.append(text[pos:end])
            pos = end
    return segments


async def text_to_speech(text, voice, rate, pitch):
    """
    Processes the input text, identifying quoted sections for different voices,
    and generates combined audio.

    Each segment is synthesized by ``process_speech_segment``; a voice tag at
    the start of a segment is interpreted there, not here.

    Args:
        text (str): The input text.
        voice (str): The default voice.
        rate (int): The speech rate.
        pitch (int): The speech pitch.

    Returns:
        tuple: (audio_path, message). audio_path is the path to the single or
        combined audio file, or None when no audio was produced. message is
        None on success, otherwise the value produced by gr.Warning / gr.Error.
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    audio_files = []
    for segment_text in _split_quoted_segments(text):
        # Blank segments (e.g. an empty pair of quotes) produce no audio.
        if not segment_text.strip():
            continue
        audio_path = await process_speech_segment(segment_text, voice, rate, pitch)
        if audio_path:
            audio_files.append(audio_path)

    if not audio_files:
        return None, None

    if len(audio_files) == 1:
        return audio_files[0], None

    # Combine audio files by plain byte concatenation.
    combined_audio_path = None
    try:
        # NamedTemporaryFile creates the file atomically, unlike tempfile.mktemp.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
            combined_audio_path = outfile.name
            for filename in audio_files:
                with open(filename, 'rb') as infile:
                    outfile.write(infile.read())
    except OSError as e:
        print(f"Error combining audio files: {e}")
        if combined_audio_path is not None and os.path.exists(combined_audio_path):
            os.remove(combined_audio_path)
        # NOTE(review): gr.Error is returned, not raised, as in the original.
        return None, gr.Error(f"Error combining audio files: {e}")
    finally:
        # Clean up individual files whether or not the merge succeeded.
        for filename in audio_files:
            try:
                os.remove(filename)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is not fatal
    return combined_audio_path, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ import edge_tts
4
  import asyncio
 
 
5
  import tempfile
6
+ import os
7
+ import re # Import the regular expression module
 
 
 
8
 
 
 
 
 
 
9
 
10
+ # Get all available voices
11
async def get_voices():
    """Fetch the edge-tts voice list and map display labels to short names.

    Returns:
        dict: Keys look like "<ShortName> - <Locale> (<Gender>)"; each value
        is the matching ShortName.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled
 
14
 
15
+ # Text-to-speech function for a single paragraph
16
async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesize one text segment to a temporary MP3 file with edge-tts.

    A two-character tag at the start of the segment selects a fixed voice
    and is removed before synthesis:
    1F/2F/3F -> Emma/Jenny/Yan, 1M/2M -> Brian/William, 1C -> Maisie.
    Without a tag the ``voice`` argument is used, falling back to the
    Andrew voice when ``voice`` is empty.

    Args:
        text (str): Segment to speak, optionally starting with a voice tag.
        voice (str): Voice label "<ShortName> - <Locale> (<Gender>)"; only
            the part before " - " is handed to edge-tts.
        rate (int): Speech-rate adjustment, formatted as a signed percentage.
        pitch (int): Pitch adjustment, formatted as a signed Hz offset.

    Returns:
        str | None: Path to the generated MP3, or None when the segment
        contains nothing to speak.
    """
    # Kept local: no module-level default_voice exists for the fallback below.
    fallback_voice = "en-US-AndrewNeural - en-US (Male)"  # good for reading
    tagged_voices = {
        "1F": "en-US-EmmaNeural - en-US (Female)",
        "2F": "en-US-JennyNeural - en-US (Female)",
        "3F": "en-HK-YanNeural - en-HK (Female)",
        "1M": "en-US-BrianNeural - en-US (Male)",
        "2M": "en-AU-WilliamNeural - en-AU (Male)",
        "1C": "en-GB-MaisieNeural - en-GB (Female)",  # Child
    }

    if not text.strip():
        return None

    tag = text[:2]
    if tag in tagged_voices:
        voice_label = tagged_voices[tag]
        # Drop the tag and the whitespace that usually follows it.
        spoken_text = text[2:].strip()
        if not spoken_text:
            return None
    else:
        # Use selected voice, or fallback to default
        voice_label = voice or fallback_voice
        spoken_text = text
    voice_short_name = voice_label.split(" - ")[0]

    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(spoken_text, voice_short_name, rate=rate_str, pitch=pitch_str)
    # Reserve a file name, then close the handle before edge-tts writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except Exception:
        # delete=False means nobody else removes the file after a failed save.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path
 
 
 
 
55
 
56
+ # Main text-to-speech function that processes paragraphs
57
async def text_to_speech(text, voice, rate, pitch):
    """Convert text to speech, one audio segment per piece between double quotes.

    The text is split at every double-quote character; each non-blank piece
    is synthesized by ``paragraph_to_speech`` and the resulting MP3 files are
    concatenated byte-for-byte.

    Args:
        text (str): The full input text.
        voice (str): Voice label used for pieces without a voice tag.
        rate (int): Speech-rate adjustment passed through to each piece.
        pitch (int): Pitch adjustment passed through to each piece.

    Returns:
        tuple: (audio_path, message). audio_path is the path to the single or
        combined MP3, or None when no audio was produced. message is None
        unless input validation failed, in which case it is the value
        produced by gr.Warning.
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # Split at double quotes and drop pieces that are empty after trimming.
    pieces = [piece.strip() for piece in text.split('"')]
    paragraphs = [piece for piece in pieces if piece]

    audio_files = []
    for paragraph in paragraphs:
        audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_path:
            audio_files.append(audio_path)

    if not audio_files:
        return None, None  # No audio generated

    if len(audio_files) == 1:
        return audio_files[0], None

    # Simple concatenation for now - consider using a proper audio editing
    # library for smoother transitions.
    combined_audio_path = None
    try:
        # NamedTemporaryFile creates the file atomically, unlike tempfile.mktemp.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
            combined_audio_path = outfile.name
            for filename in audio_files:
                with open(filename, 'rb') as infile:
                    outfile.write(infile.read())
    except OSError:
        # Do not leave a half-written combined file behind.
        if combined_audio_path is not None and os.path.exists(combined_audio_path):
            os.remove(combined_audio_path)
        raise
    finally:
        # Clean up individual files whether or not the merge succeeded.
        for filename in audio_files:
            try:
                os.remove(filename)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is not fatal
    return combined_audio_path, None
88
+
89
+ # Gradio interface function
90
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Synchronous wrapper that runs ``text_to_speech`` to completion.

    Gradio calls this with the four input values; the coroutine is driven
    with ``asyncio.run`` and its (audio, warning) pair is returned unchanged.
    """
    # NOTE(review): the spaces.GPU decorator presumably requests GPU hardware
    # on Hugging Face Spaces - confirm it is needed here.
    outcome = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, notice = outcome
    return audio_path, notice
94
+
95
+ # Create Gradio application
96
+ import gradio as gr
97
+
98
async def create_demo():
    """Build the Gradio interface for the text-to-speech app.

    The voice dropdown is filled from ``get_voices()``; the interface calls
    ``tts_interface`` with the text, voice, rate and pitch inputs.

    Returns:
        gr.Interface: The configured, not yet launched, interface.
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewNeural - en-US (Male)"  # Pick one of the available voices
    # Help text mirrors the tag table and quote-based splitting used by the
    # synthesis functions.
    description = """
    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:US_Brian, 2M:AU_William, 1C: Childvoice
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application splits your text at double quotes and speaks each piece separately; start a piece with one of the tags above to change its voice.
    """

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(
                label="Input Text",
                lines=5,
                placeholder="Text is split at double quotes; start a piece with 1F, 2F, 3F, 1M, 2M or 1C to change its voice.",
            ),
            gr.Dropdown(choices=["", *voices], label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
        description=description,
        article="Process text paragraph by paragraph for smoother output.",
        analytics_enabled=False,
        # Gradio expects one of the strings "never", "auto" or "manual" here.
        allow_flagging="never",
    )
    return demo
126
+
127
+ # Run the application
128
if __name__ == "__main__":
    # create_demo is a coroutine, so run it to completion before launching.
    build_interface = create_demo()
    demo = asyncio.run(build_interface)
    demo.launch()