milwright committed on
Commit
af54f4b
·
verified ·
1 Parent(s): aaa7d4e

Upload 4 files

Browse files
Files changed (2) hide show
  1. app.py +54 -40
  2. config.json +3 -3
app.py CHANGED
@@ -26,15 +26,15 @@ DEFAULT_CONFIG = {
26
  'max_tokens': 250,
27
  'model': 'google/gemma-3-27b-it',
28
  'api_key_var': 'API_KEY',
29
- 'theme': 'Default',
30
  'grounding_urls': ["https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"],
31
  'enable_dynamic_urls': True,
32
  'enable_file_upload': True,
33
  'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
34
  'language': 'Italian',
35
  'enable_tts': True,
36
- 'tts_model': 'facebook/fastspeech2-en-ljspeech',
37
- 'tts_voice': 'default',
38
  'locked': False
39
  }
40
 
@@ -533,51 +533,66 @@ def verify_hf_token_access() -> Tuple[bool, str]:
533
 
534
 
535
  def generate_tts(text: str, max_retries: int = 2) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
536
- """Generate TTS audio using HuggingFace Inference API"""
537
  if not ENABLE_TTS or not text:
538
  return None, "TTS disabled or no text provided"
539
 
540
- hf_token = os.getenv("HF_TOKEN")
541
- if not hf_token:
542
- return None, "⚠️ HF_TOKEN not configured for TTS"
543
 
544
  # Limit text length for TTS
545
- text = text[:500]
546
 
547
- # Prepare payload - most models just need the text
548
- payload = {"inputs": text}
 
549
 
550
  for attempt in range(max_retries):
551
  try:
552
- headers = {"Authorization": f"Bearer {hf_token}"}
553
- api_url = f"https://api-inference.huggingface.co/models/{TTS_MODEL}"
 
 
 
 
554
 
 
 
 
 
 
 
 
 
 
 
555
 
556
  response = requests.post(
557
  api_url,
558
  headers=headers,
559
  json=payload,
560
- timeout=20
561
  )
562
 
563
  if response.status_code == 200:
564
- # Convert audio bytes to numpy array
565
- audio_array = np.frombuffer(response.content, dtype=np.int16)
566
- # Most TTS models output at 16kHz
567
- sample_rate = 16000
568
- return (sample_rate, audio_array), "✅ Audio generated successfully"
569
-
570
- elif response.status_code == 503:
571
- # Model is loading
572
- if attempt < max_retries - 1:
573
- time.sleep(20) # Wait for model to load
574
- continue
575
- else:
576
- return None, " Model is loading, please try again in a moment"
577
 
578
  else:
579
  try:
580
- error_msg = response.json().get('error', 'Unknown error')
581
  except:
582
  error_msg = response.text if response.text else 'Unknown error'
583
  return None, f"❌ API Error ({response.status_code}): {error_msg}"
@@ -712,11 +727,11 @@ def create_interface():
712
  if not last_message:
713
  return None, gr.update(visible=False), gr.update(value="⚠️ No message to read", visible=True)
714
 
715
- audio_data, status_msg = generate_tts(last_message)
716
 
717
- if audio_data:
718
  return (
719
- audio_data,
720
  gr.update(visible=True),
721
  gr.update(value=status_msg, visible=True)
722
  )
@@ -992,20 +1007,19 @@ def create_interface():
992
  info="Enable text-to-speech for assistant responses"
993
  )
994
  edit_tts_model = gr.Dropdown(
995
- label="TTS Model",
996
  choices=[
997
- "facebook/fastspeech2-en-ljspeech",
998
- "facebook/mms-tts-eng",
999
- "espnet/kan-bayashi_ljspeech_vits",
1000
- "microsoft/speecht5_tts"
1001
  ],
1002
- value=config.get('tts_model', 'facebook/fastspeech2-en-ljspeech'),
1003
  allow_custom_value=True
1004
  )
1005
  edit_tts_voice = gr.Dropdown(
1006
  label="Voice",
1007
- choices=["default", "female", "male", "neutral"],
1008
- value=config.get('tts_voice', 'default')
 
1009
  )
1010
 
1011
  # Configuration actions
@@ -1095,8 +1109,8 @@ def create_interface():
1095
  DEFAULT_CONFIG['enable_dynamic_urls'],
1096
  DEFAULT_CONFIG['enable_file_upload'],
1097
  DEFAULT_CONFIG.get('enable_tts', False),
1098
- DEFAULT_CONFIG.get('tts_model', 'microsoft/speecht5_tts'),
1099
- DEFAULT_CONFIG.get('tts_voice', 'default'),
1100
  "✅ Reset to default configuration"
1101
  )
1102
  else:
 
26
  'max_tokens': 250,
27
  'model': 'google/gemma-3-27b-it',
28
  'api_key_var': 'API_KEY',
29
+ 'theme': 'Base',
30
  'grounding_urls': ["https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"],
31
  'enable_dynamic_urls': True,
32
  'enable_file_upload': True,
33
  'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
34
  'language': 'Italian',
35
  'enable_tts': True,
36
+ 'tts_model': 'openai/tts-1-hd',
37
+ 'tts_voice': 'onyx',
38
  'locked': False
39
  }
40
 
 
533
 
534
 
535
  def generate_tts(text: str, max_retries: int = 2) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
536
+ """Generate TTS audio using OpenAI's TTS API through OpenRouter"""
537
  if not ENABLE_TTS or not text:
538
  return None, "TTS disabled or no text provided"
539
 
540
+ api_key = os.getenv(API_KEY_VAR)
541
+ if not api_key:
542
+ return None, f"⚠️ {API_KEY_VAR} not configured for TTS"
543
 
544
  # Limit text length for TTS
545
+ text = text[:1000] # OpenAI supports up to 4096 chars but let's be reasonable
546
 
547
+ # OpenAI TTS models and voices
548
+ model = TTS_MODEL if TTS_MODEL.startswith("openai/") else "openai/tts-1"
549
+ voice = TTS_VOICE if TTS_VOICE in ["alloy", "echo", "fable", "onyx", "nova", "shimmer"] else "alloy"
550
 
551
  for attempt in range(max_retries):
552
  try:
553
+ headers = {
554
+ "Authorization": f"Bearer {api_key}",
555
+ "HTTP-Referer": "https://huggingface.co",
556
+ "X-Title": SPACE_NAME,
557
+ "Content-Type": "application/json"
558
+ }
559
 
560
+ # OpenRouter endpoint for OpenAI TTS
561
+ api_url = "https://openrouter.ai/api/v1/audio/speech"
562
+
563
+ payload = {
564
+ "model": model,
565
+ "input": text,
566
+ "voice": voice,
567
+ "response_format": "mp3", # Can be mp3, opus, aac, flac
568
+ "speed": 1.0 # 0.25 to 4.0
569
+ }
570
 
571
  response = requests.post(
572
  api_url,
573
  headers=headers,
574
  json=payload,
575
+ timeout=30
576
  )
577
 
578
  if response.status_code == 200:
579
+ # OpenAI returns MP3 audio data
580
+ # Convert to format Gradio expects
581
+ try:
582
+ # Save temporarily and load with a library that can read MP3
583
+ import tempfile
584
+ with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_file:
585
+ tmp_file.write(response.content)
586
+ tmp_path = tmp_file.name
587
+
588
+ # For now, return the file path - Gradio can handle MP3 files
589
+ return tmp_path, "✅ Audio generated successfully"
590
+ except Exception as e:
591
+ return None, f" Error processing audio: {str(e)}"
592
 
593
  else:
594
  try:
595
+ error_msg = response.json().get('error', {}).get('message', 'Unknown error')
596
  except:
597
  error_msg = response.text if response.text else 'Unknown error'
598
  return None, f"❌ API Error ({response.status_code}): {error_msg}"
 
727
  if not last_message:
728
  return None, gr.update(visible=False), gr.update(value="⚠️ No message to read", visible=True)
729
 
730
+ audio_file, status_msg = generate_tts(last_message)
731
 
732
+ if audio_file:
733
  return (
734
+ audio_file, # File path for Gradio to play
735
  gr.update(visible=True),
736
  gr.update(value=status_msg, visible=True)
737
  )
 
1007
  info="Enable text-to-speech for assistant responses"
1008
  )
1009
  edit_tts_model = gr.Dropdown(
1010
+ label="TTS Model",
1011
  choices=[
1012
+ "openai/tts-1",
1013
+ "openai/tts-1-hd"
 
 
1014
  ],
1015
+ value=config.get('tts_model', 'openai/tts-1'),
1016
  allow_custom_value=True
1017
  )
1018
  edit_tts_voice = gr.Dropdown(
1019
  label="Voice",
1020
+ choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
1021
+ value=config.get('tts_voice', 'alloy'),
1022
+ info="alloy: neutral, echo: male, fable: british male, onyx: deep male, nova: female, shimmer: female"
1023
  )
1024
 
1025
  # Configuration actions
 
1109
  DEFAULT_CONFIG['enable_dynamic_urls'],
1110
  DEFAULT_CONFIG['enable_file_upload'],
1111
  DEFAULT_CONFIG.get('enable_tts', False),
1112
+ DEFAULT_CONFIG.get('tts_model', 'openai/tts-1'),
1113
+ DEFAULT_CONFIG.get('tts_voice', 'alloy'),
1114
  "✅ Reset to default configuration"
1115
  )
1116
  else:
config.json CHANGED
@@ -21,7 +21,7 @@
21
  "enable_dynamic_urls": true,
22
  "enable_file_upload": true,
23
  "enable_tts": true,
24
- "tts_model": "facebook/fastspeech2-en-ljspeech",
25
- "tts_voice": "default",
26
- "theme": "Default"
27
  }
 
21
  "enable_dynamic_urls": true,
22
  "enable_file_upload": true,
23
  "enable_tts": true,
24
+ "tts_model": "openai/tts-1-hd",
25
+ "tts_voice": "onyx",
26
+ "theme": "Base"
27
  }