Spaces:

milwright
/

speech-buddy

Sleeping

App Files Community

milwright committed 5 days ago

Commit

aaa7d4e

verified ·

1 Parent(s): 54ce8c0

Upload 4 files

Browse files

Files changed (2) hide show

app.py +8 -26
config.json +4 -2

app.py CHANGED Viewed

@@ -27,13 +27,13 @@ DEFAULT_CONFIG = {
     'model': 'google/gemma-3-27b-it',
     'api_key_var': 'API_KEY',
     'theme': 'Default',
-    'grounding_urls': [],
     'enable_dynamic_urls': True,
     'enable_file_upload': True,
     'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
     'language': 'Italian',
     'enable_tts': True,
-    'tts_model': 'microsoft/speecht5_tts',
     'tts_voice': 'default',
     'locked': False
 }
@@ -544,32 +544,14 @@ def generate_tts(text: str, max_retries: int = 2) -> Tuple[Optional[Tuple[int, n
     # Limit text length for TTS
     text = text[:500]
-    # Select speaker embedding based on voice preference
-    speaker_embeddings = None
-    if TTS_MODEL == "microsoft/speecht5_tts":
-        # For SpeechT5, we need speaker embeddings
-        # Using a predefined speaker ID (7306 is a clear female voice)
-        speaker_id = {
-            "default": 7306,
-            "female": 7306,
-            "male": 5105,
-            "neutral": 6678
-        }.get(TTS_VOICE, 7306)
-        # Note: In production, you'd load actual embeddings from the dataset
-        # For now, we'll let the API handle default voice
-        speaker_embeddings = {"speaker_id": speaker_id}
     for attempt in range(max_retries):
         try:
             headers = {"Authorization": f"Bearer {hf_token}"}
             api_url = f"https://api-inference.huggingface.co/models/{TTS_MODEL}"
-            # Prepare payload
-            payload = {"inputs": text}
-            if speaker_embeddings and TTS_MODEL == "microsoft/speecht5_tts":
-                # For models that support speaker embeddings
-                payload["parameters"] = speaker_embeddings
             response = requests.post(
                 api_url,
@@ -1012,12 +994,12 @@ def create_interface():
                             edit_tts_model = gr.Dropdown(
                                 label="TTS Model",
                                 choices=[
-                                    "microsoft/speecht5_tts",
                                     "facebook/mms-tts-eng",
-                                    "suno/bark",
-                                    "parler-tts/parler-tts-mini-v1"
                                 ],
-                                value=config.get('tts_model', 'microsoft/speecht5_tts'),
                                 allow_custom_value=True
                             )
                             edit_tts_voice = gr.Dropdown(

     'model': 'google/gemma-3-27b-it',
     'api_key_var': 'API_KEY',
     'theme': 'Default',
+    'grounding_urls': ["https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"],
     'enable_dynamic_urls': True,
     'enable_file_upload': True,
     'examples': ['Ciao! Come stai oggi?', 'Mi piace giocare a calcio. E tu?', 'Cosa mangi di solito a colazione?', 'A che ora ti svegli la mattina?', 'Qual è il tuo sport preferito?'],
     'language': 'Italian',
     'enable_tts': True,
+    'tts_model': 'facebook/fastspeech2-en-ljspeech',
     'tts_voice': 'default',
     'locked': False
 }
     # Limit text length for TTS
     text = text[:500]
+    # Prepare payload - most models just need the text
+    payload = {"inputs": text}
     for attempt in range(max_retries):
         try:
             headers = {"Authorization": f"Bearer {hf_token}"}
             api_url = f"https://api-inference.huggingface.co/models/{TTS_MODEL}"
             response = requests.post(
                 api_url,
                             edit_tts_model = gr.Dropdown(
                                 label="TTS Model",
                                 choices=[
+                                    "facebook/fastspeech2-en-ljspeech",
                                     "facebook/mms-tts-eng",
+                                    "espnet/kan-bayashi_ljspeech_vits",
+                                    "microsoft/speecht5_tts"
                                 ],
+                                value=config.get('tts_model', 'facebook/fastspeech2-en-ljspeech'),
                                 allow_custom_value=True
                             )
                             edit_tts_voice = gr.Dropdown(

config.json CHANGED Viewed

@@ -15,11 +15,13 @@
     "A che ora ti svegli la mattina?",
     "Qual \u00e8 il tuo sport preferito?"
   ],
-  "grounding_urls": [],
   "enable_dynamic_urls": true,
   "enable_file_upload": true,
   "enable_tts": true,
-  "tts_model": "microsoft/speecht5_tts",
   "tts_voice": "default",
   "theme": "Default"
 }

     "A che ora ti svegli la mattina?",
     "Qual \u00e8 il tuo sport preferito?"
   ],
+  "grounding_urls": [
+    "https://www.pnac.org/wp-content/uploads/Italian-Study-Guide.pdf"
+  ],
   "enable_dynamic_urls": true,
   "enable_file_upload": true,
   "enable_tts": true,
+  "tts_model": "facebook/fastspeech2-en-ljspeech",
   "tts_voice": "default",
   "theme": "Default"
 }