mrrtmob committed on
Commit
c0c6352
·
1 Parent(s): 5193c5e

Update requirements to include necessary dependencies

Browse files
Files changed (1) hide show
  1. app.py +185 -75
app.py CHANGED
@@ -4,65 +4,83 @@ from snac import SNAC
4
  import torch
5
  import gradio as gr
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
- from huggingface_hub import snapshot_download, login
8
  from dotenv import load_dotenv
9
 
10
  load_dotenv()
11
 
12
  # Get HF token from environment variables
13
  hf_token = os.getenv("HF_TOKEN")
 
 
 
 
 
14
  if hf_token:
15
  login(token=hf_token)
16
- print("Successfully logged in to Hugging Face")
 
 
 
 
 
 
17
  else:
18
  print("Warning: HF_TOKEN not found in environment variables")
19
 
 
 
20
  # Check if CUDA is available
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
22
 
23
  print("Loading SNAC model...")
24
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
25
  snac_model = snac_model.to(device)
 
26
 
27
  model_name = "mrrtmob/tts-khm-kore"
28
 
 
29
  # Download only model config and safetensors with token
30
  snapshot_download(
31
  repo_id=model_name,
32
- token=hf_token, # Add token here
33
  allow_patterns=[
34
  "config.json",
35
  "*.safetensors",
36
  "model.safetensors.index.json",
 
 
 
 
 
37
  ],
38
  ignore_patterns=[
39
  "optimizer.pt",
40
  "pytorch_model.bin",
41
  "training_args.bin",
42
- "scheduler.pt",
43
- "tokenizer.json",
44
- "tokenizer_config.json",
45
- "special_tokens_map.json",
46
- "vocab.json",
47
- "merges.txt",
48
- "tokenizer.*"
49
  ]
50
  )
 
51
 
 
52
  # Load model and tokenizer with token
53
  model = AutoModelForCausalLM.from_pretrained(
54
  model_name,
55
  torch_dtype=torch.bfloat16,
56
- token=hf_token # Add token here
 
57
  )
58
- model.to(device)
59
 
 
60
  tokenizer = AutoTokenizer.from_pretrained(
61
  model_name,
62
- token=hf_token # Add token here
63
  )
64
 
65
- print(f"Khmer TTS model loaded to {device}")
66
 
67
  # Process text prompt
68
  def process_prompt(prompt, voice, tokenizer, device):
@@ -96,22 +114,37 @@ def parse_output(generated_ids):
96
  trimmed_row = row[:new_length]
97
  trimmed_row = [t - 128266 for t in trimmed_row]
98
  code_lists.append(trimmed_row)
99
- return code_lists[0] # Return just the first one for single sample
100
 
101
  # Redistribute codes for audio generation
102
  def redistribute_codes(code_list, snac_model):
 
 
 
103
  device = next(snac_model.parameters()).device # Get the device of SNAC model
104
  layer_1 = []
105
  layer_2 = []
106
  layer_3 = []
 
107
  for i in range((len(code_list)+1)//7):
108
- layer_1.append(code_list[7*i])
109
- layer_2.append(code_list[7*i+1]-4096)
110
- layer_3.append(code_list[7*i+2]-(2*4096))
111
- layer_3.append(code_list[7*i+3]-(3*4096))
112
- layer_2.append(code_list[7*i+4]-(4*4096))
113
- layer_3.append(code_list[7*i+5]-(5*4096))
114
- layer_3.append(code_list[7*i+6]-(6*4096))
 
 
 
 
 
 
 
 
 
 
 
115
  # Move tensors to the same device as the SNAC model
116
  codes = [
117
  torch.tensor(layer_1, device=device).unsqueeze(0),
@@ -122,13 +155,18 @@ def redistribute_codes(code_list, snac_model):
122
  return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
123
 
124
  # Main generation function
125
- @spaces.GPU()
126
  def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
127
  if not text.strip():
 
128
  return None
 
129
  try:
130
  progress(0.1, "Processing text...")
 
 
131
  input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
 
132
  progress(0.3, "Generating speech tokens...")
133
  with torch.no_grad():
134
  generated_ids = model.generate(
@@ -141,27 +179,43 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
141
  repetition_penalty=repetition_penalty,
142
  num_return_sequences=1,
143
  eos_token_id=128258,
 
144
  )
 
145
  progress(0.6, "Processing speech tokens...")
146
  code_list = parse_output(generated_ids)
 
 
 
 
 
147
  progress(0.8, "Converting to audio...")
148
  audio_samples = redistribute_codes(code_list, snac_model)
 
 
 
 
 
 
149
  return (24000, audio_samples) # Return sample rate and audio
 
150
  except Exception as e:
151
- print(f"Error generating speech: {e}")
 
 
152
  return None
153
 
154
  # Examples for the UI - Khmer text examples
155
  examples = [
156
  ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
157
  ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
158
- ["ម្សិលមិញ ខ្ញុំឃើញឆ្មាមួយក្បាលដេញចាប់កន្ទុយខ្លួនឯង។ <laugh> វាគួរឲ្យអស់សំណើចណាស់។"], # Yesterday, I saw a cat chasing its own tail. <laugh> It was so funny.
159
- ["ខ្ញុំរៀបចំម្ហូប ស្រាប់តែធ្វើជ្រុះគ្រឿងទេសពេញឥដ្ឋ។ <chuckle> វាប្រឡាក់អស់ហើយ។"], # I was preparing food when suddenly I dropped spices all over the floor. <chuckle> It's all messed up.
160
- ["ថ្ងៃនេះហត់ណាស់ ធ្វើការពេញមួយថ្ងៃ។ <sigh> ចង់ទៅផ្ទះសម្រាកហើយ។"], # So tired today, worked all day. <sigh> Want to go home and rest now.
161
- ["អាកាសធាតុត្រជាក់ ធ្វើឲ្យខ្ញុំផ្តាសាយតិចៗ។ <sniffle> ខ្ញុំក៏ក្អកដែរ។ <cough>"], # The cold weather made me get a bit of a cold. <sniffle> I also cough. <cough>
162
- ["ការប្រឡងមិនបានល្អដូចការរំពឹងទុកទេ។ <groan> ខ្ញុំត្រូវរៀនឲ្យខ្លាំងជាងនេះ។"], # The exam didn't go as well as expected. <groan> I need to study harder.
163
- ["កិច្ចប្រជុំនេះវែងអន្លាយពេកហើយ។ <yawn> ខ្ញុំចាប់ផ្តើមងងុយគេងហើយ។"], # This meeting is too long. <yawn> I'm starting to get sleepy.
164
- ["ខ្ញុំដើរទៅទិញអីញ៉ាំ ស្រាប់តែឃើញឆ្កែធំមួយរត់មករកខ្ញុំ។ <gasp> ខ្ញុំភ័យណាស់! តែវារត់ទៅបាត់វិញ។ <sigh>"], # I was walking to buy something when suddenly I saw a big dog running towards me. <gasp> I was so scared! But then it ran away. <sigh>
165
  ["អរគុណច្រើនសម្រាប់ជំនួយ។ <chuckle> បើគ្មានអ្នកទេ ខ្ញុំមិនដឹងធ្វើយ៉ាងម៉េចទេ។"],
166
  ]
167
 
@@ -171,57 +225,91 @@ examples = [
171
  # Available Emotive Tags
172
  EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  # Create Gradio interface
175
- with gr.Blocks(title="Khmer Text-to-Speech") as demo:
176
  gr.Markdown(f"""
 
 
177
  # 🎵 Khmer Text-to-Speech
178
  **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
 
179
  បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
 
180
  💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
 
 
181
  """)
182
- text_input = gr.Textbox(
183
- label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
184
- placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
185
- lines=4
186
- )
187
- # Voice selector (commented out)
188
- # voice = gr.Dropdown(
189
- # choices=VOICES,
190
- # value="tara",
191
- # label="Voice (សំលេង)"
192
- # )
193
- # Advanced Settings
194
- with gr.Accordion("🔧 Advanced Settings", open=False):
195
- with gr.Row():
196
- temperature = gr.Slider(
197
- minimum=0.1, maximum=1.5, value=0.6, step=0.05,
198
- label="Temperature",
199
- info="Higher values create more expressive speech"
200
- )
201
- top_p = gr.Slider(
202
- minimum=0.1, maximum=1.0, value=0.95, step=0.05,
203
- label="Top P",
204
- info="Nucleus sampling threshold"
205
- )
206
- with gr.Row():
207
- repetition_penalty = gr.Slider(
208
- minimum=1.0, maximum=2.0, value=1.1, step=0.05,
209
- label="Repetition Penalty",
210
- info="Higher values discourage repetitive patterns"
211
  )
212
- max_new_tokens = gr.Slider(
213
- minimum=100, maximum=2000, value=1200, step=100,
214
- label="Max Length",
215
- info="Maximum length of generated audio"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  )
217
- with gr.Row():
218
- submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
219
- clear_btn = gr.Button("🗑️ Clear", size="lg")
220
- audio_output = gr.Audio(
221
- label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
222
- type="numpy",
223
- show_label=True
224
- )
225
  # Set up examples (NO CACHE)
226
  gr.Examples(
227
  examples=examples,
@@ -229,19 +317,41 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
229
  outputs=audio_output,
230
  fn=lambda text: generate_speech(text),
231
  cache_examples=False,
 
232
  )
 
233
  # Set up event handlers
234
  submit_btn.click(
235
  fn=generate_speech,
236
  inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
237
- outputs=audio_output
 
238
  )
 
239
  clear_btn.click(
240
  fn=lambda: (None, None),
241
  inputs=[],
242
  outputs=[text_input, audio_output]
243
  )
 
 
 
 
 
 
 
 
244
 
245
  # Launch the app
246
  if __name__ == "__main__":
247
- demo.queue().launch(share=False)
 
 
 
 
 
 
 
 
 
 
 
4
  import torch
5
  import gradio as gr
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ from huggingface_hub import snapshot_download, login, whoami
8
  from dotenv import load_dotenv
9
 
10
  load_dotenv()
11
 
12
  # Get HF token from environment variables
13
  hf_token = os.getenv("HF_TOKEN")
14
+
15
+ # Debug and authentication
16
+ print("=== DEBUG INFO ===")
17
+ print(f"HF_TOKEN exists: {bool(hf_token)}")
18
+
19
  if hf_token:
20
  login(token=hf_token)
21
+ try:
22
+ user_info = whoami(token=hf_token)
23
+ print(f"Successfully logged in as: {user_info.get('name', 'Unknown')}")
24
+ print(f"User type: {user_info.get('type', 'Unknown')}")
25
+ print(f"User ID: {user_info.get('id', 'Unknown')}")
26
+ except Exception as e:
27
+ print(f"Authentication error: {e}")
28
  else:
29
  print("Warning: HF_TOKEN not found in environment variables")
30
 
31
+ print("=== END DEBUG ===")
32
+
33
  # Check if CUDA is available
34
  device = "cuda" if torch.cuda.is_available() else "cpu"
35
+ print(f"Using device: {device}")
36
 
37
  print("Loading SNAC model...")
38
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
39
  snac_model = snac_model.to(device)
40
+ print("SNAC model loaded successfully")
41
 
42
  model_name = "mrrtmob/tts-khm-kore"
43
 
44
+ print(f"Downloading model files from {model_name}...")
45
  # Download only model config and safetensors with token
46
  snapshot_download(
47
  repo_id=model_name,
48
+ token=hf_token,
49
  allow_patterns=[
50
  "config.json",
51
  "*.safetensors",
52
  "model.safetensors.index.json",
53
+ "tokenizer.json",
54
+ "tokenizer_config.json",
55
+ "special_tokens_map.json",
56
+ "vocab.json",
57
+ "merges.txt"
58
  ],
59
  ignore_patterns=[
60
  "optimizer.pt",
61
  "pytorch_model.bin",
62
  "training_args.bin",
63
+ "scheduler.pt"
 
 
 
 
 
 
64
  ]
65
  )
66
+ print("Model files downloaded successfully")
67
 
68
+ print("Loading main model...")
69
  # Load model and tokenizer with token
70
  model = AutoModelForCausalLM.from_pretrained(
71
  model_name,
72
  torch_dtype=torch.bfloat16,
73
+ token=hf_token,
74
+ device_map="auto"
75
  )
 
76
 
77
+ print("Loading tokenizer...")
78
  tokenizer = AutoTokenizer.from_pretrained(
79
  model_name,
80
+ token=hf_token
81
  )
82
 
83
+ print(f"Khmer TTS model loaded successfully to {device}")
84
 
85
  # Process text prompt
86
  def process_prompt(prompt, voice, tokenizer, device):
 
114
  trimmed_row = row[:new_length]
115
  trimmed_row = [t - 128266 for t in trimmed_row]
116
  code_lists.append(trimmed_row)
117
+ return code_lists[0] if code_lists else [] # Return just the first one for single sample
118
 
119
  # Redistribute codes for audio generation
120
  def redistribute_codes(code_list, snac_model):
121
+ if not code_list:
122
+ return None
123
+
124
  device = next(snac_model.parameters()).device # Get the device of SNAC model
125
  layer_1 = []
126
  layer_2 = []
127
  layer_3 = []
128
+
129
  for i in range((len(code_list)+1)//7):
130
+ if 7*i < len(code_list):
131
+ layer_1.append(code_list[7*i])
132
+ if 7*i+1 < len(code_list):
133
+ layer_2.append(code_list[7*i+1]-4096)
134
+ if 7*i+2 < len(code_list):
135
+ layer_3.append(code_list[7*i+2]-(2*4096))
136
+ if 7*i+3 < len(code_list):
137
+ layer_3.append(code_list[7*i+3]-(3*4096))
138
+ if 7*i+4 < len(code_list):
139
+ layer_2.append(code_list[7*i+4]-(4*4096))
140
+ if 7*i+5 < len(code_list):
141
+ layer_3.append(code_list[7*i+5]-(5*4096))
142
+ if 7*i+6 < len(code_list):
143
+ layer_3.append(code_list[7*i+6]-(6*4096))
144
+
145
+ if not layer_1:
146
+ return None
147
+
148
  # Move tensors to the same device as the SNAC model
149
  codes = [
150
  torch.tensor(layer_1, device=device).unsqueeze(0),
 
155
  return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
156
 
157
  # Main generation function
158
+ @spaces.GPU(duration=120)
159
  def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
160
  if not text.strip():
161
+ gr.Warning("Please enter some text to generate speech.")
162
  return None
163
+
164
  try:
165
  progress(0.1, "Processing text...")
166
+ print(f"Generating speech for text: {text[:50]}...")
167
+
168
  input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
169
+
170
  progress(0.3, "Generating speech tokens...")
171
  with torch.no_grad():
172
  generated_ids = model.generate(
 
179
  repetition_penalty=repetition_penalty,
180
  num_return_sequences=1,
181
  eos_token_id=128258,
182
+ pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.pad_token_id
183
  )
184
+
185
  progress(0.6, "Processing speech tokens...")
186
  code_list = parse_output(generated_ids)
187
+
188
+ if not code_list:
189
+ gr.Warning("Failed to generate valid audio codes.")
190
+ return None
191
+
192
  progress(0.8, "Converting to audio...")
193
  audio_samples = redistribute_codes(code_list, snac_model)
194
+
195
+ if audio_samples is None:
196
+ gr.Warning("Failed to convert codes to audio.")
197
+ return None
198
+
199
+ print("Speech generation completed successfully")
200
  return (24000, audio_samples) # Return sample rate and audio
201
+
202
  except Exception as e:
203
+ error_msg = f"Error generating speech: {str(e)}"
204
+ print(error_msg)
205
+ gr.Error(error_msg)
206
  return None
207
 
208
  # Examples for the UI - Khmer text examples
209
  examples = [
210
  ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ហើយខ្ញុំជា AI ដែលអាចបម្លែងអត្ថបទទៅជាសំលេង។"],
211
  ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច។"],
212
+ ["ម្សិលមិញ ខ្ញុំឃើញឆ្មាមួយក្បាលដេញចាប់កន្ទុយខ្លួនឯង។ <laugh> វាគួរឲ្យអស់សំណើចណាស់។"],
213
+ ["ខ្ញុំរៀបចំម្ហូប ស្រាប់តែធ្វើជ្រុះគ្រឿងទេសពេញឥដ្ឋ។ <chuckle> វាប្រឡាក់អស់ហើយ។"],
214
+ ["ថ្ងៃនេះហត់ណាស់ ធ្វើការពេញមួយថ្ងៃ។ <sigh> ចង់ទៅផ្ទះសម្រាកហើយ។"],
215
+ ["អាកាសធាតុត្រជាក់ ធ្វើឲ្យខ្ញុំផ្តាសាយតិចៗ។ <sniffle> ខ្ញុំក៏ក្អកដែរ។ <cough>"],
216
+ ["ការប្រឡងមិនបានល្អដូចការរំពឹងទុកទេ។ <groan> ខ្ញុំត្រូវរៀនឲ្យខ្លាំងជាងនេះ។"],
217
+ ["កិច្ចប្រជុំនេះវែងអន្លាយពេកហើយ។ <yawn> ខ្ញុំចាប់ផ្តើមងងុយគេងហើយ។"],
218
+ ["ខ្ញុំដើរទៅទិញអីញ៉ាំ ស្រាប់តែឃើញឆ្កែធំមួយរត់មករកខ្ញុំ។ <gasp> ខ្ញុំភ័យណាស់! តែវារត់ទៅបាត់វិញ។ <sigh>"],
219
  ["អរគុណច្រើនសម្រាប់ជំនួយ។ <chuckle> បើគ្មានអ្នកទេ ខ្ញុំមិនដឹងធ្វើយ៉ាងម៉េចទេ។"],
220
  ]
221
 
 
225
  # Available Emotive Tags
226
  EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
227
 
228
+ # Create custom CSS
229
+ css = """
230
+ .gradio-container {
231
+ max-width: 1200px;
232
+ margin: auto;
233
+ padding-top: 1.5rem;
234
+ }
235
+ .main-header {
236
+ text-align: center;
237
+ margin-bottom: 2rem;
238
+ }
239
+ .generate-btn {
240
+ background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
241
+ border: none !important;
242
+ color: white !important;
243
+ font-weight: bold !important;
244
+ }
245
+ .clear-btn {
246
+ background: linear-gradient(45deg, #95A5A6, #BDC3C7) !important;
247
+ border: none !important;
248
+ color: white !important;
249
+ }
250
+ """
251
+
252
  # Create Gradio interface
253
+ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as demo:
254
  gr.Markdown(f"""
255
+ <div class="main-header">
256
+
257
  # 🎵 Khmer Text-to-Speech
258
  **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
259
+
260
  បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
261
+
262
  💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
263
+
264
+ </div>
265
  """)
266
+
267
+ with gr.Row():
268
+ with gr.Column(scale=2):
269
+ text_input = gr.Textbox(
270
+ label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
271
+ placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
272
+ lines=4,
273
+ max_lines=8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  )
275
+
276
+ # Advanced Settings
277
+ with gr.Accordion("🔧 Advanced Settings", open=False):
278
+ with gr.Row():
279
+ temperature = gr.Slider(
280
+ minimum=0.1, maximum=1.5, value=0.6, step=0.05,
281
+ label="Temperature",
282
+ info="Higher values create more expressive speech"
283
+ )
284
+ top_p = gr.Slider(
285
+ minimum=0.1, maximum=1.0, value=0.95, step=0.05,
286
+ label="Top P",
287
+ info="Nucleus sampling threshold"
288
+ )
289
+ with gr.Row():
290
+ repetition_penalty = gr.Slider(
291
+ minimum=1.0, maximum=2.0, value=1.1, step=0.05,
292
+ label="Repetition Penalty",
293
+ info="Higher values discourage repetitive patterns"
294
+ )
295
+ max_new_tokens = gr.Slider(
296
+ minimum=100, maximum=2000, value=1200, step=100,
297
+ label="Max Length",
298
+ info="Maximum length of generated audio"
299
+ )
300
+
301
+ with gr.Row():
302
+ submit_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg", elem_classes=["generate-btn"])
303
+ clear_btn = gr.Button("🗑️ Clear", size="lg", elem_classes=["clear-btn"])
304
+
305
+ with gr.Column(scale=1):
306
+ audio_output = gr.Audio(
307
+ label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
308
+ type="numpy",
309
+ show_label=True,
310
+ interactive=False
311
  )
312
+
 
 
 
 
 
 
 
313
  # Set up examples (NO CACHE)
314
  gr.Examples(
315
  examples=examples,
 
317
  outputs=audio_output,
318
  fn=lambda text: generate_speech(text),
319
  cache_examples=False,
320
+ label="📝 Example Texts (អត្ថបទគំរូ)"
321
  )
322
+
323
  # Set up event handlers
324
  submit_btn.click(
325
  fn=generate_speech,
326
  inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
327
+ outputs=audio_output,
328
+ show_progress=True
329
  )
330
+
331
  clear_btn.click(
332
  fn=lambda: (None, None),
333
  inputs=[],
334
  outputs=[text_input, audio_output]
335
  )
336
+
337
+ # Add keyboard shortcut
338
+ text_input.submit(
339
+ fn=generate_speech,
340
+ inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
341
+ outputs=audio_output,
342
+ show_progress=True
343
+ )
344
 
345
  # Launch the app
346
  if __name__ == "__main__":
347
+ print("Starting Gradio interface...")
348
+ demo.queue(
349
+ max_size=20,
350
+ default_concurrency_limit=5
351
+ ).launch(
352
+ server_name="0.0.0.0",
353
+ server_port=7860,
354
+ share=False,
355
+ show_error=True,
356
+ quiet=False
357
+ )