Spaces:

milwright
/

speech-buddy

Sleeping

App Files Files Community

milwright committed 4 days ago

Commit

65c037b

verified ·

1 Parent(s): af54f4b

Upload 4 files

Browse files

Files changed (3) hide show

app.py +131 -158
config.json +6 -4
requirements.txt +2 -2

app.py CHANGED Viewed

@@ -9,8 +9,16 @@ from datetime import datetime
 import urllib.parse
 from pathlib import Path
 from typing import List, Dict, Optional, Any, Tuple
-import numpy as np
-import time
 # Configuration
@@ -26,15 +34,14 @@ DEFAULT_CONFIG = {
     'max_tokens': 250,
     'model': 'google/gemma-3-27b-it',
     'api_key_var': 'API_KEY',
-    'theme': 'Base',
     'grounding_urls': ["https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"],
     'enable_dynamic_urls': True,
     'enable_file_upload': True,
     'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
     'language': 'Italian',
-    'enable_tts': True,
-    'tts_model': 'openai/tts-1-hd',
-    'tts_voice': 'onyx',
     'locked': False
 }
@@ -143,9 +150,6 @@ GROUNDING_URLS = config.get('grounding_urls', DEFAULT_CONFIG['grounding_urls'])
 ENABLE_DYNAMIC_URLS = config.get('enable_dynamic_urls', DEFAULT_CONFIG['enable_dynamic_urls'])
 ENABLE_FILE_UPLOAD = config.get('enable_file_upload', DEFAULT_CONFIG.get('enable_file_upload', True))
 LANGUAGE = config.get('language', DEFAULT_CONFIG.get('language', 'English'))
-ENABLE_TTS = config.get('enable_tts', DEFAULT_CONFIG.get('enable_tts', False))
-TTS_MODEL = config.get('tts_model', DEFAULT_CONFIG.get('tts_model', 'microsoft/speecht5_tts'))
-TTS_VOICE = config.get('tts_voice', DEFAULT_CONFIG.get('tts_voice', 'default'))
 # Environment variables
 ACCESS_CODE = os.environ.get("ACCESS_CODE")
@@ -290,6 +294,80 @@ def process_file_upload(file_path: str) -> str:
 _url_content_cache = {}
 def get_grounding_context() -> str:
     """Get grounding context from configured URLs with caching"""
     urls = GROUNDING_URLS
@@ -532,80 +610,6 @@ def verify_hf_token_access() -> Tuple[bool, str]:
         return False, f"Error verifying HF token: {str(e)}"
-def generate_tts(text: str, max_retries: int = 2) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
-    """Generate TTS audio using OpenAI's TTS API through OpenRouter"""
-    if not ENABLE_TTS or not text:
-        return None, "TTS disabled or no text provided"
-    api_key = os.getenv(API_KEY_VAR)
-    if not api_key:
-        return None, f"⚠️ {API_KEY_VAR} not configured for TTS"
-    # Limit text length for TTS
-    text = text[:1000]  # OpenAI supports up to 4096 chars but let's be reasonable
-    # OpenAI TTS models and voices
-    model = TTS_MODEL if TTS_MODEL.startswith("openai/") else "openai/tts-1"
-    voice = TTS_VOICE if TTS_VOICE in ["alloy", "echo", "fable", "onyx", "nova", "shimmer"] else "alloy"
-    for attempt in range(max_retries):
-        try:
-            headers = {
-                "Authorization": f"Bearer {api_key}",
-                "HTTP-Referer": "https://huggingface.co",
-                "X-Title": SPACE_NAME,
-                "Content-Type": "application/json"
-            }
-            # OpenRouter endpoint for OpenAI TTS
-            api_url = "https://openrouter.ai/api/v1/audio/speech"
-            payload = {
-                "model": model,
-                "input": text,
-                "voice": voice,
-                "response_format": "mp3",  # Can be mp3, opus, aac, flac
-                "speed": 1.0  # 0.25 to 4.0
-            }
-            response = requests.post(
-                api_url,
-                headers=headers,
-                json=payload,
-                timeout=30
-            )
-            if response.status_code == 200:
-                # OpenAI returns MP3 audio data
-                # Convert to format Gradio expects
-                try:
-                    # Save temporarily and load with a library that can read MP3
-                    import tempfile
-                    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_file:
-                        tmp_file.write(response.content)
-                        tmp_path = tmp_file.name
-                    # For now, return the file path - Gradio can handle MP3 files
-                    return tmp_path, "✅ Audio generated successfully"
-                except Exception as e:
-                    return None, f"❌ Error processing audio: {str(e)}"
-            else:
-                try:
-                    error_msg = response.json().get('error', {}).get('message', 'Unknown error')
-                except:
-                    error_msg = response.text if response.text else 'Unknown error'
-                return None, f"❌ API Error ({response.status_code}): {error_msg}"
-        except requests.exceptions.Timeout:
-            return None, "⏰ TTS request timeout"
-        except Exception as e:
-            if attempt == max_retries - 1:
-                return None, f"❌ TTS Error: {str(e)}"
-            time.sleep(2)
-    return None, "❌ Max retries exceeded"
 # Create main interface with clean tab structure
 def create_interface():
@@ -679,6 +683,19 @@ def create_interface():
                             size="sm"
                         )
                     # Export handler
                     def prepare_export():
                         if not chat_history_store:
@@ -702,58 +719,44 @@ def create_interface():
                         outputs=[export_btn]
                     )
-                    # TTS functionality
-                    if ENABLE_TTS:
-                        with gr.Row():
-                            tts_btn = gr.Button("🔊 Read Last Response", variant="secondary", size="sm")
-                            audio_output = gr.Audio(label="TTS Output", visible=False, autoplay=True)
-                        tts_status = gr.Markdown("", visible=False)
-                        last_assistant_message = gr.State("")
-                        def update_last_message(chat_history):
-                            """Extract the last assistant message from chat history"""
                             if not chat_history:
-                                return ""
-                            # Find the last assistant message
-                            for message in reversed(chat_history):
-                                if isinstance(message, dict) and message.get('role') == 'assistant':
-                                    return message.get('content', '')
-                            return ""
-                        def handle_tts_click(last_message):
-                            """Handle TTS button click"""
-                            if not last_message:
-                                return None, gr.update(visible=False), gr.update(value="⚠️ No message to read", visible=True)
-                            audio_file, status_msg = generate_tts(last_message)
-                            if audio_file:
                                 return (
-                                    audio_file,  # File path for Gradio to play
-                                    gr.update(visible=True),
-                                    gr.update(value=status_msg, visible=True)
                                 )
                             else:
                                 return (
                                     None,
-                                    gr.update(visible=False),
-                                    gr.update(value=status_msg, visible=True)
                                 )
-                        # Update last message whenever chat updates
-                        chatbot.change(
-                            update_last_message,
-                            inputs=[chatbot],
-                            outputs=[last_assistant_message]
-                        )
-                        # Handle TTS button click
                         tts_btn.click(
-                            handle_tts_click,
-                            inputs=[last_assistant_message],
-                            outputs=[audio_output, audio_output, tts_status]
                         )
                     # Examples section
@@ -998,30 +1001,6 @@ def create_interface():
                                 info="Allow users to upload files for context"
                             )
-                        # TTS Configuration
-                        gr.Markdown("### 🔊 Text-to-Speech")
-                        with gr.Row():
-                            edit_enable_tts = gr.Checkbox(
-                                label="Enable TTS",
-                                value=config.get('enable_tts', False),
-                                info="Enable text-to-speech for assistant responses"
-                            )
-                            edit_tts_model = gr.Dropdown(
-                                label="TTS Model",
-                                choices=[
-                                    "openai/tts-1",
-                                    "openai/tts-1-hd"
-                                ],
-                                value=config.get('tts_model', 'openai/tts-1'),
-                                allow_custom_value=True
-                            )
-                            edit_tts_voice = gr.Dropdown(
-                                label="Voice",
-                                choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
-                                value=config.get('tts_voice', 'alloy'),
-                                info="alloy: neutral, echo: male, fable: british male, onyx: deep male, nova: female, shimmer: female"
-                            )
                         # Configuration actions
                         with gr.Row():
                             save_btn = gr.Button("💾 Save Configuration", variant="primary")
@@ -1029,7 +1008,7 @@ def create_interface():
                         config_status = gr.Markdown()
-                        def save_configuration(name, description, system_prompt, model, language, temp, tokens, examples, grounding_urls, enable_dynamic_urls, enable_file_upload, enable_tts, tts_model, tts_voice):
                             """Save updated configuration"""
                             try:
                                 updated_config = config.copy()
@@ -1045,9 +1024,6 @@ def create_interface():
                                     'grounding_urls': [url.strip() for url in grounding_urls.split('\n') if url.strip()],
                                     'enable_dynamic_urls': enable_dynamic_urls,
                                     'enable_file_upload': enable_file_upload,
-                                    'enable_tts': enable_tts,
-                                    'tts_model': tts_model,
-                                    'tts_voice': tts_voice,
                                     'locked': config.get('locked', False)
                                 })
@@ -1088,7 +1064,7 @@ def create_interface():
                             save_configuration,
                             inputs=[edit_name, edit_description, edit_system_prompt, edit_model, edit_language,
                                    edit_temperature, edit_max_tokens, edit_examples, edit_grounding_urls,
-                                   edit_enable_dynamic_urls, edit_enable_file_upload, edit_enable_tts, edit_tts_model, edit_tts_voice],
                             outputs=[config_status]
                         )
@@ -1108,21 +1084,18 @@ def create_interface():
                                         '\n'.join(DEFAULT_CONFIG['grounding_urls']),
                                         DEFAULT_CONFIG['enable_dynamic_urls'],
                                         DEFAULT_CONFIG['enable_file_upload'],
-                                        DEFAULT_CONFIG.get('enable_tts', False),
-                                        DEFAULT_CONFIG.get('tts_model', 'openai/tts-1'),
-                                        DEFAULT_CONFIG.get('tts_voice', 'alloy'),
                                         "✅ Reset to default configuration"
                                     )
                                 else:
-                                    return (*[gr.update() for _ in range(14)], "❌ Failed to reset")
                             except Exception as e:
-                                return (*[gr.update() for _ in range(14)], f"❌ Error: {str(e)}")
                         reset_btn.click(
                             reset_configuration,
                             outputs=[edit_name, edit_description, edit_system_prompt, edit_model, edit_language,
                                     edit_temperature, edit_max_tokens, edit_examples, edit_grounding_urls,
-                                    edit_enable_dynamic_urls, edit_enable_file_upload, edit_enable_tts, edit_tts_model, edit_tts_voice, config_status]
                         )
                         # Configuration tab authentication handler

 import urllib.parse
 from pathlib import Path
 from typing import List, Dict, Optional, Any, Tuple
+import base64
+import io
+# Try to import gradio_client for TTS support
+try:
+    from gradio_client import Client
+    GRADIO_CLIENT_AVAILABLE = True
+except ImportError:
+    GRADIO_CLIENT_AVAILABLE = False
+    print("Warning: gradio_client not available. TTS features will be disabled.")
 # Configuration
     'max_tokens': 250,
     'model': 'google/gemma-3-27b-it',
     'api_key_var': 'API_KEY',
+    'theme': 'Default',
     'grounding_urls': ["https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"],
     'enable_dynamic_urls': True,
     'enable_file_upload': True,
     'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
     'language': 'Italian',
+    'enable_tts': False,
+    'tts_spaces': ['facebook/mms-tts-eng', 'microsoft/speecht5-tts-demo'],
     'locked': False
 }
 ENABLE_DYNAMIC_URLS = config.get('enable_dynamic_urls', DEFAULT_CONFIG['enable_dynamic_urls'])
 ENABLE_FILE_UPLOAD = config.get('enable_file_upload', DEFAULT_CONFIG.get('enable_file_upload', True))
 LANGUAGE = config.get('language', DEFAULT_CONFIG.get('language', 'English'))
 # Environment variables
 ACCESS_CODE = os.environ.get("ACCESS_CODE")
 _url_content_cache = {}
+def generate_tts(text: str, hf_token: Optional[str] = None) -> Optional[Tuple[int, Any]]:
+    """
+    Generate text-to-speech audio using HuggingFace Spaces via gradio_client.
+    Uses multiple fallback options for maximum reliability.
+    Returns: Tuple of (sample_rate, audio_array) or None if failed
+    """
+    if not GRADIO_CLIENT_AVAILABLE:
+        return None
+    if not text or not text.strip():
+        return None
+    # Get HF token from environment if not provided
+    if not hf_token:
+        hf_token = os.getenv("HF_TOKEN")
+    # Get TTS spaces from config
+    tts_spaces = DEFAULT_CONFIG.get('tts_spaces', [])
+    if not tts_spaces:
+        # Default fallback spaces if none configured
+        tts_spaces = [
+            "facebook/mms-tts-eng",
+            "microsoft/speecht5-tts-demo",
+            "coqui/XTTS",
+            "myshell-ai/OpenVoice"
+        ]
+    # Limit text length for TTS
+    max_text_length = 500
+    if len(text) > max_text_length:
+        text = text[:max_text_length] + "..."
+    # Try each TTS space in order
+    for space_name in tts_spaces:
+        try:
+            print(f"Trying TTS space: {space_name}")
+            client = Client(space_name, hf_token=hf_token)
+            # Different spaces have different APIs, try common patterns
+            try:
+                # Pattern 1: Simple text input
+                result = client.predict(text, api_name="/predict")
+            except:
+                try:
+                    # Pattern 2: Text + language
+                    result = client.predict(text, "en", api_name="/predict")
+                except:
+                    try:
+                        # Pattern 3: Text + voice/speaker
+                        result = client.predict(text, "default", api_name="/predict")
+                    except:
+                        continue
+            # Handle different return types
+            if isinstance(result, str) and os.path.exists(result):
+                # Result is a file path
+                import soundfile as sf
+                audio_data, sample_rate = sf.read(result)
+                return (sample_rate, audio_data)
+            elif isinstance(result, tuple) and len(result) >= 2:
+                # Result is (sample_rate, audio_array)
+                return result
+            elif hasattr(result, 'get') and 'audio' in result:
+                # Result is a dict with audio key
+                return result['audio']
+        except Exception as e:
+            print(f"TTS failed with {space_name}: {str(e)}")
+            continue
+    return None
 def get_grounding_context() -> str:
     """Get grounding context from configured URLs with caching"""
     urls = GROUNDING_URLS
         return False, f"Error verifying HF token: {str(e)}"
 # Create main interface with clean tab structure
 def create_interface():
                             size="sm"
                         )
+                    # TTS functionality
+                    if DEFAULT_CONFIG.get('enable_tts', False) and GRADIO_CLIENT_AVAILABLE:
+                        with gr.Row():
+                            tts_btn = gr.Button("🔊 Read Last Response", variant="secondary", size="sm")
+                            tts_status = gr.Textbox(label="TTS Status", visible=False, interactive=False)
+                        audio_output = gr.Audio(
+                            label="TTS Output",
+                            visible=False,
+                            autoplay=True,
+                            type="numpy"
+                        )
                     # Export handler
                     def prepare_export():
                         if not chat_history_store:
                         outputs=[export_btn]
                     )
+                    # TTS handler
+                    if DEFAULT_CONFIG.get('enable_tts', False) and GRADIO_CLIENT_AVAILABLE:
+                        def handle_tts(chat_history):
+                            """Generate TTS for the last assistant message"""
                             if not chat_history:
+                                return None, gr.update(value="No messages to read", visible=True)
+                            # Find last assistant message
+                            last_assistant_msg = None
+                            for msg in reversed(chat_history):
+                                if msg.get("role") == "assistant":
+                                    last_assistant_msg = msg.get("content", "")
+                                    break
+                            if not last_assistant_msg:
+                                return None, gr.update(value="No assistant message found", visible=True)
+                            # Update status
+                            status_msg = "🎯 Generating audio..."
+                            # Generate TTS
+                            audio_result = generate_tts(last_assistant_msg)
+                            if audio_result:
                                 return (
+                                    gr.update(value=audio_result, visible=True),
+                                    gr.update(value="✅ Audio generated successfully", visible=True)
                                 )
                             else:
                                 return (
                                     None,
+                                    gr.update(value="❌ TTS generation failed", visible=True)
                                 )
                         tts_btn.click(
+                            handle_tts,
+                            inputs=[chatbot],
+                            outputs=[audio_output, tts_status]
                         )
                     # Examples section
                                 info="Allow users to upload files for context"
                             )
                         # Configuration actions
                         with gr.Row():
                             save_btn = gr.Button("💾 Save Configuration", variant="primary")
                         config_status = gr.Markdown()
+                        def save_configuration(name, description, system_prompt, model, language, temp, tokens, examples, grounding_urls, enable_dynamic_urls, enable_file_upload):
                             """Save updated configuration"""
                             try:
                                 updated_config = config.copy()
                                     'grounding_urls': [url.strip() for url in grounding_urls.split('\n') if url.strip()],
                                     'enable_dynamic_urls': enable_dynamic_urls,
                                     'enable_file_upload': enable_file_upload,
                                     'locked': config.get('locked', False)
                                 })
                             save_configuration,
                             inputs=[edit_name, edit_description, edit_system_prompt, edit_model, edit_language,
                                    edit_temperature, edit_max_tokens, edit_examples, edit_grounding_urls,
+                                   edit_enable_dynamic_urls, edit_enable_file_upload],
                             outputs=[config_status]
                         )
                                         '\n'.join(DEFAULT_CONFIG['grounding_urls']),
                                         DEFAULT_CONFIG['enable_dynamic_urls'],
                                         DEFAULT_CONFIG['enable_file_upload'],
                                         "✅ Reset to default configuration"
                                     )
                                 else:
+                                    return (*[gr.update() for _ in range(11)], "❌ Failed to reset")
                             except Exception as e:
+                                return (*[gr.update() for _ in range(11)], f"❌ Error: {str(e)}")
                         reset_btn.click(
                             reset_configuration,
                             outputs=[edit_name, edit_description, edit_system_prompt, edit_model, edit_language,
                                     edit_temperature, edit_max_tokens, edit_examples, edit_grounding_urls,
+                                    edit_enable_dynamic_urls, edit_enable_file_upload, config_status]
                         )
                         # Configuration tab authentication handler

config.json CHANGED Viewed

@@ -20,8 +20,10 @@
   ],
   "enable_dynamic_urls": true,
   "enable_file_upload": true,
-  "enable_tts": true,
-  "tts_model": "openai/tts-1-hd",
-  "tts_voice": "onyx",
-  "theme": "Base"
 }

   ],
   "enable_dynamic_urls": true,
   "enable_file_upload": true,
+  "enable_tts": false,
+  "tts_spaces": [
+    "facebook/mms-tts-eng",
+    "microsoft/speecht5-tts-demo"
+  ],
+  "theme": "Default"
 }

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 gradio>=5.39.0
 requests>=2.32.3
 beautifulsoup4>=4.12.3
 python-dotenv>=1.0.0
-huggingface-hub>=0.20.0
-numpy>=1.24.0

 gradio>=5.39.0
+gradio_client>=1.0.0
 requests>=2.32.3
 beautifulsoup4>=4.12.3
 python-dotenv>=1.0.0
+huggingface-hub>=0.20.0