Spaces:

milwright
/

speech-buddy

Sleeping

App Files Files Community

milwright committed 5 days ago

Commit

af54f4b

verified ·

1 Parent(s): aaa7d4e

Upload 4 files

Browse files

Files changed (2) hide show

app.py +54 -40
config.json +3 -3

app.py CHANGED Viewed

@@ -26,15 +26,15 @@ DEFAULT_CONFIG = {
     'max_tokens': 250,
     'model': 'google/gemma-3-27b-it',
     'api_key_var': 'API_KEY',
-    'theme': 'Default',
     'grounding_urls': ["https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"],
     'enable_dynamic_urls': True,
     'enable_file_upload': True,
     'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
     'language': 'Italian',
     'enable_tts': True,
-    'tts_model': 'facebook/fastspeech2-en-ljspeech',
-    'tts_voice': 'default',
     'locked': False
 }
@@ -533,51 +533,66 @@ def verify_hf_token_access() -> Tuple[bool, str]:
 def generate_tts(text: str, max_retries: int = 2) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
-    """Generate TTS audio using HuggingFace Inference API"""
     if not ENABLE_TTS or not text:
         return None, "TTS disabled or no text provided"
-    hf_token = os.getenv("HF_TOKEN")
-    if not hf_token:
-        return None, "⚠️ HF_TOKEN not configured for TTS"
     # Limit text length for TTS
-    text = text[:500]
-    # Prepare payload - most models just need the text
-    payload = {"inputs": text}
     for attempt in range(max_retries):
         try:
-            headers = {"Authorization": f"Bearer {hf_token}"}
-            api_url = f"https://api-inference.huggingface.co/models/{TTS_MODEL}"
             response = requests.post(
                 api_url,
                 headers=headers,
                 json=payload,
-                timeout=20
             )
             if response.status_code == 200:
-                # Convert audio bytes to numpy array
-                audio_array = np.frombuffer(response.content, dtype=np.int16)
-                # Most TTS models output at 16kHz
-                sample_rate = 16000
-                return (sample_rate, audio_array), "✅ Audio generated successfully"
-            elif response.status_code == 503:
-                # Model is loading
-                if attempt < max_retries - 1:
-                    time.sleep(20)  # Wait for model to load
-                    continue
-                else:
-                    return None, "⏳ Model is loading, please try again in a moment"
             else:
                 try:
-                    error_msg = response.json().get('error', 'Unknown error')
                 except:
                     error_msg = response.text if response.text else 'Unknown error'
                 return None, f"❌ API Error ({response.status_code}): {error_msg}"
@@ -712,11 +727,11 @@ def create_interface():
                             if not last_message:
                                 return None, gr.update(visible=False), gr.update(value="⚠️ No message to read", visible=True)
-                            audio_data, status_msg = generate_tts(last_message)
-                            if audio_data:
                                 return (
-                                    audio_data,
                                     gr.update(visible=True),
                                     gr.update(value=status_msg, visible=True)
                                 )
@@ -992,20 +1007,19 @@ def create_interface():
                                 info="Enable text-to-speech for assistant responses"
                             )
                             edit_tts_model = gr.Dropdown(
-                                label="TTS Model",
                                 choices=[
-                                    "facebook/fastspeech2-en-ljspeech",
-                                    "facebook/mms-tts-eng",
-                                    "espnet/kan-bayashi_ljspeech_vits",
-                                    "microsoft/speecht5_tts"
                                 ],
-                                value=config.get('tts_model', 'facebook/fastspeech2-en-ljspeech'),
                                 allow_custom_value=True
                             )
                             edit_tts_voice = gr.Dropdown(
                                 label="Voice",
-                                choices=["default", "female", "male", "neutral"],
-                                value=config.get('tts_voice', 'default')
                             )
                         # Configuration actions
@@ -1095,8 +1109,8 @@ def create_interface():
                                         DEFAULT_CONFIG['enable_dynamic_urls'],
                                         DEFAULT_CONFIG['enable_file_upload'],
                                         DEFAULT_CONFIG.get('enable_tts', False),
-                                        DEFAULT_CONFIG.get('tts_model', 'microsoft/speecht5_tts'),
-                                        DEFAULT_CONFIG.get('tts_voice', 'default'),
                                         "✅ Reset to default configuration"
                                     )
                                 else:

     'max_tokens': 250,
     'model': 'google/gemma-3-27b-it',
     'api_key_var': 'API_KEY',
+    'theme': 'Base',
     'grounding_urls': ["https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"],
     'enable_dynamic_urls': True,
     'enable_file_upload': True,
     'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
     'language': 'Italian',
     'enable_tts': True,
+    'tts_model': 'openai/tts-1-hd',
+    'tts_voice': 'onyx',
     'locked': False
 }
 def generate_tts(text: str, max_retries: int = 2) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
+    """Generate TTS audio using OpenAI's TTS API through OpenRouter"""
     if not ENABLE_TTS or not text:
         return None, "TTS disabled or no text provided"
+    api_key = os.getenv(API_KEY_VAR)
+    if not api_key:
+        return None, f"⚠️ {API_KEY_VAR} not configured for TTS"
     # Limit text length for TTS
+    text = text[:1000]  # OpenAI supports up to 4096 chars but let's be reasonable
+    # OpenAI TTS models and voices
+    model = TTS_MODEL if TTS_MODEL.startswith("openai/") else "openai/tts-1"
+    voice = TTS_VOICE if TTS_VOICE in ["alloy", "echo", "fable", "onyx", "nova", "shimmer"] else "alloy"
     for attempt in range(max_retries):
         try:
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "HTTP-Referer": "https://huggingface.co",
+                "X-Title": SPACE_NAME,
+                "Content-Type": "application/json"
+            }
+            # OpenRouter endpoint for OpenAI TTS
+            api_url = "https://openrouter.ai/api/v1/audio/speech"
+            payload = {
+                "model": model,
+                "input": text,
+                "voice": voice,
+                "response_format": "mp3",  # Can be mp3, opus, aac, flac
+                "speed": 1.0  # 0.25 to 4.0
+            }
             response = requests.post(
                 api_url,
                 headers=headers,
                 json=payload,
+                timeout=30
             )
             if response.status_code == 200:
+                # OpenAI returns MP3 audio data
+                # Convert to format Gradio expects
+                try:
+                    # Save temporarily and load with a library that can read MP3
+                    import tempfile
+                    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_file:
+                        tmp_file.write(response.content)
+                        tmp_path = tmp_file.name
+                    # For now, return the file path - Gradio can handle MP3 files
+                    return tmp_path, "✅ Audio generated successfully"
+                except Exception as e:
+                    return None, f"❌ Error processing audio: {str(e)}"
             else:
                 try:
+                    error_msg = response.json().get('error', {}).get('message', 'Unknown error')
                 except:
                     error_msg = response.text if response.text else 'Unknown error'
                 return None, f"❌ API Error ({response.status_code}): {error_msg}"
                             if not last_message:
                                 return None, gr.update(visible=False), gr.update(value="⚠️ No message to read", visible=True)
+                            audio_file, status_msg = generate_tts(last_message)
+                            if audio_file:
                                 return (
+                                    audio_file,  # File path for Gradio to play
                                     gr.update(visible=True),
                                     gr.update(value=status_msg, visible=True)
                                 )
                                 info="Enable text-to-speech for assistant responses"
                             )
                             edit_tts_model = gr.Dropdown(
+                                label="TTS Model",
                                 choices=[
+                                    "openai/tts-1",
+                                    "openai/tts-1-hd"
                                 ],
+                                value=config.get('tts_model', 'openai/tts-1'),
                                 allow_custom_value=True
                             )
                             edit_tts_voice = gr.Dropdown(
                                 label="Voice",
+                                choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
+                                value=config.get('tts_voice', 'alloy'),
+                                info="alloy: neutral, echo: male, fable: british male, onyx: deep male, nova: female, shimmer: female"
                             )
                         # Configuration actions
                                         DEFAULT_CONFIG['enable_dynamic_urls'],
                                         DEFAULT_CONFIG['enable_file_upload'],
                                         DEFAULT_CONFIG.get('enable_tts', False),
+                                        DEFAULT_CONFIG.get('tts_model', 'openai/tts-1'),
+                                        DEFAULT_CONFIG.get('tts_voice', 'alloy'),
                                         "✅ Reset to default configuration"
                                     )
                                 else:

config.json CHANGED Viewed

@@ -21,7 +21,7 @@
   "enable_dynamic_urls": true,
   "enable_file_upload": true,
   "enable_tts": true,
-  "tts_model": "facebook/fastspeech2-en-ljspeech",
-  "tts_voice": "default",
-  "theme": "Default"
 }

   "enable_dynamic_urls": true,
   "enable_file_upload": true,
   "enable_tts": true,
+  "tts_model": "openai/tts-1-hd",
+  "tts_voice": "onyx",
+  "theme": "Base"
 }