Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Athspi committed on May 23

Commit

280b5d0

verified ·

1 Parent(s): 385365a

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -26

app.py CHANGED Viewed

@@ -11,28 +11,31 @@ from werkzeug.utils import secure_filename
 from gtts import gTTS, lang
 from kokoro import KPipeline
-from google import genai
-from google.genai import types
-# API key setup
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
-client = genai.Client(api_key=GEMINI_API_KEY)
-# App config
 app = Flask(__name__, static_folder='static')
 CORS(app)
-# Language support
-KOKORO_LANGUAGES = {"American English": "a", "British English": "b", "Mandarin Chinese": "z",
-                    "Spanish": "e", "French": "f", "Hindi": "h", "Italian": "i", "Brazilian Portuguese": "p"}
 GTTS_LANGUAGES = lang.tts_langs()
 GTTS_LANGUAGES['ja'] = 'Japanese'
-SUPPORTED_LANGUAGES = sorted(list(set(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values()))))
-GEMINI_VOICE = "Kore"
 def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
     with wave.open(filename, "wb") as wf:
@@ -65,8 +68,8 @@ def translate_audio():
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
-        model = genai.GenerativeModel("gemini-2.0-flash")
         audio_blob = {
             'mime_type': audio_file.mimetype,
             'data': audio_file.read()
@@ -84,36 +87,38 @@ def translate_audio():
         # Try Gemini 2.5 TTS
         try:
-            response = client.models.generate_content(
-                model="gemini-2.5-flash-preview-tts",
                 contents=translated_text,
-                config=types.GenerateContentConfig(
-                    response_modalities=["AUDIO"],
-                    speech_config=types.SpeechConfig(
-                        voice_config=types.VoiceConfig(
-                            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=GEMINI_VOICE)
-                        )
                     )
                 )
             )
-            data = response.candidates[0].content.parts[0].inline_data.data
             temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gemini.wav")
             wave_file(temp_output_path, data)
-        except Exception:
-            # Fallback: Kokoro or gTTS
             if target_language in KOKORO_LANGUAGES:
                 lang_code = KOKORO_LANGUAGES[target_language]
                 pipeline = KPipeline(lang_code=lang_code)
                 generator = pipeline(translated_text, voice="af_heart", speed=1)
                 audio_segments = [audio for _, _, audio in generator if audio is not None]
                 if audio_segments:
                     audio_data = np.concatenate(audio_segments)
                     temp_output_path = os.path.join(tempfile.gettempdir(), "tts_kokoro.wav")
                     sf.write(temp_output_path, audio_data, 24000)
                 else:
-                    raise ValueError("No audio generated by Kokoro")
             else:
                 lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
                 tts = gTTS(translated_text, lang=lang_code)
@@ -127,7 +132,7 @@ def translate_audio():
         })
     except Exception as e:
-        app.logger.error(f"Error: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')

 from gtts import gTTS, lang
 from kokoro import KPipeline
+import google.generativeai as genai
+from google.generativeai.types import GenerateContentConfig, SpeechConfig, VoiceConfig, PrebuiltVoiceConfig
+# Load API key
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
+genai.configure(api_key=GEMINI_API_KEY)
+# Flask app setup
 app = Flask(__name__, static_folder='static')
 CORS(app)
+# Supported languages
+KOKORO_LANGUAGES = {
+    "American English": "a", "British English": "b", "Mandarin Chinese": "z",
+    "Spanish": "e", "French": "f", "Hindi": "h", "Italian": "i", "Brazilian Portuguese": "p"
+}
 GTTS_LANGUAGES = lang.tts_langs()
 GTTS_LANGUAGES['ja'] = 'Japanese'
+SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES.keys()) | set(GTTS_LANGUAGES.values()))
+# Voice name for Gemini TTS
+DEFAULT_GEMINI_VOICE = "Kore"
 def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
     with wave.open(filename, "wb") as wf:
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
+        # Transcribe audio with Gemini
+        model = genai.GenerativeModel("models/gemini-1.5-flash")
         audio_blob = {
             'mime_type': audio_file.mimetype,
             'data': audio_file.read()
         # Try Gemini 2.5 TTS
         try:
+            tts_response = genai.generate_content(
+                model="models/gemini-2.5-flash-preview-tts",
                 contents=translated_text,
+                generation_config=GenerateContentConfig(
+                    response_mime_type="audio/wav"
+                ),
+                speech_config=SpeechConfig(
+                    voice_config=VoiceConfig(
+                        prebuilt_voice=PrebuiltVoiceConfig(voice_name=DEFAULT_GEMINI_VOICE)
                     )
                 )
             )
+            data = tts_response.candidates[0].content.parts[0].inline_data.data
             temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gemini.wav")
             wave_file(temp_output_path, data)
+        except Exception as gemini_tts_error:
+            app.logger.warning(f"Gemini TTS failed: {gemini_tts_error}")
+            # Fallback to Kokoro or gTTS
             if target_language in KOKORO_LANGUAGES:
                 lang_code = KOKORO_LANGUAGES[target_language]
                 pipeline = KPipeline(lang_code=lang_code)
                 generator = pipeline(translated_text, voice="af_heart", speed=1)
                 audio_segments = [audio for _, _, audio in generator if audio is not None]
                 if audio_segments:
                     audio_data = np.concatenate(audio_segments)
                     temp_output_path = os.path.join(tempfile.gettempdir(), "tts_kokoro.wav")
                     sf.write(temp_output_path, audio_data, 24000)
                 else:
+                    raise ValueError("No audio generated by Kokoro.")
             else:
                 lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
                 tts = gTTS(translated_text, lang=lang_code)
         })
     except Exception as e:
+        app.logger.error(f"Processing error: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')