Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Athspi committed 19 days ago

Commit

385365a

verified ·

1 Parent(s): f49c906

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -64

app.py CHANGED Viewed

@@ -1,41 +1,45 @@
 import os
 import numpy as np
 from flask import Flask, request, jsonify, send_file, send_from_directory
-import google.generativeai as genai
 from gtts import gTTS, lang
-import tempfile
-import soundfile as sf
 from kokoro import KPipeline
-from werkzeug.utils import secure_filename
-from flask_cors import CORS
-app = Flask(__name__, static_folder='static')
-CORS(app)
-# Configure Gemini API
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
-genai.configure(api_key=GEMINI_API_KEY)
-# Language configurations
-KOKORO_LANGUAGES = {
-    "American English": "a",
-    "British English": "b",
-    "Mandarin Chinese": "z",
-    "Spanish": "e",
-    "French": "f",
-    "Hindi": "h",
-    "Italian": "i",
-    "Brazilian Portuguese": "p"
-}
 GTTS_LANGUAGES = lang.tts_langs()
-GTTS_LANGUAGES['ja'] = 'Japanese'  # Explicit Japanese support
-SUPPORTED_LANGUAGES = sorted(
-    list(set(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values())))
-)
 @app.route('/')
 def serve_index():
@@ -50,71 +54,80 @@ def translate_audio():
     try:
         if 'audio' not in request.files:
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
         target_language = request.form.get('language', 'English')
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
-        # Validate MIME type
         allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
-        # Transcribe audio using Gemini
         model = genai.GenerativeModel("gemini-2.0-flash")
-        # Create proper audio blob
         audio_blob = {
             'mime_type': audio_file.mimetype,
             'data': audio_file.read()
         }
-        # Get transcription
         convo = model.start_chat()
-        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
         response = convo.send_message(audio_blob)
         transcription = response.text.strip()
-        # Translate text using Gemini
-        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
-        response = model.generate_content(prompt)
-        translated_text = response.text.strip()
-        # Generate TTS
-        if target_language in KOKORO_LANGUAGES:
-            lang_code = KOKORO_LANGUAGES[target_language]
-            pipeline = KPipeline(lang_code=lang_code)
-            generator = pipeline(translated_text, voice="af_heart", speed=1)
-            # Collect all audio segments
-            audio_segments = []
-            for _, _, audio in generator:
-                if audio is not None:
-                    audio_segments.append(audio)
-            if audio_segments:
-                audio_data = np.concatenate(audio_segments)
-                _, temp_output_path = tempfile.mkstemp(suffix=".wav")
-                sf.write(temp_output_path, audio_data, 24000)
             else:
-                raise ValueError("No audio generated by Kokoro")
-        else:
-            # Standard gTTS handling
-            lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
-            tts = gTTS(translated_text, lang=lang_code)
-            _, temp_output_path = tempfile.mkstemp(suffix=".mp3")
-            tts.save(temp_output_path)
         return jsonify({
             'transcription': transcription,
             'translation': translated_text,
             'audio_url': f'/download/{os.path.basename(temp_output_path)}'
         })
     except Exception as e:
-        app.logger.error(f"Error processing request: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')

 import os
+import tempfile
+import wave
 import numpy as np
+import soundfile as sf
 from flask import Flask, request, jsonify, send_file, send_from_directory
+from flask_cors import CORS
+from werkzeug.utils import secure_filename
 from gtts import gTTS, lang
 from kokoro import KPipeline
+from google import genai
+from google.genai import types
+# API key setup
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
+client = genai.Client(api_key=GEMINI_API_KEY)
+# App config
+app = Flask(__name__, static_folder='static')
+CORS(app)
+# Language support
+KOKORO_LANGUAGES = {"American English": "a", "British English": "b", "Mandarin Chinese": "z",
+                    "Spanish": "e", "French": "f", "Hindi": "h", "Italian": "i", "Brazilian Portuguese": "p"}
 GTTS_LANGUAGES = lang.tts_langs()
+GTTS_LANGUAGES['ja'] = 'Japanese'
+SUPPORTED_LANGUAGES = sorted(list(set(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values()))))
+GEMINI_VOICE = "Kore"
+# Write `pcm` to `filename` as a WAV file using the stdlib `wave` module.
+# Defaults: 1 channel, 24000 Hz frame rate, 2-byte sample width.
+# NOTE(review): `pcm` is presumably raw PCM bytes matching those parameters; confirm against the caller.
+def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
+    with wave.open(filename, "wb") as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(sample_width)
+        wf.setframerate(rate)
+        # Frames are written as given; this function does no resampling or conversion.
+        wf.writeframes(pcm)
 @app.route('/')
 def serve_index():
     try:
         if 'audio' not in request.files:
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
         target_language = request.form.get('language', 'English')
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
         allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
         model = genai.GenerativeModel("gemini-2.0-flash")
         audio_blob = {
             'mime_type': audio_file.mimetype,
             'data': audio_file.read()
         }
         convo = model.start_chat()
+        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim.")
         response = convo.send_message(audio_blob)
         transcription = response.text.strip()
+        # Translate
+        prompt = f"Translate the following text to {target_language}:\n\n{transcription}"
+        translation_response = model.generate_content(prompt)
+        translated_text = translation_response.text.strip()
+        # Try Gemini 2.5 TTS
+        try:
+            response = client.models.generate_content(
+                model="gemini-2.5-flash-preview-tts",
+                contents=translated_text,
+                config=types.GenerateContentConfig(
+                    response_modalities=["AUDIO"],
+                    speech_config=types.SpeechConfig(
+                        voice_config=types.VoiceConfig(
+                            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=GEMINI_VOICE)
+                        )
+                    )
+                )
+            )
+            data = response.candidates[0].content.parts[0].inline_data.data
+            temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gemini.wav")
+            wave_file(temp_output_path, data)
+        except Exception:
+            # Fallback: Kokoro or gTTS
+            if target_language in KOKORO_LANGUAGES:
+                lang_code = KOKORO_LANGUAGES[target_language]
+                pipeline = KPipeline(lang_code=lang_code)
+                generator = pipeline(translated_text, voice="af_heart", speed=1)
+                audio_segments = [audio for _, _, audio in generator if audio is not None]
+                if audio_segments:
+                    audio_data = np.concatenate(audio_segments)
+                    temp_output_path = os.path.join(tempfile.gettempdir(), "tts_kokoro.wav")
+                    sf.write(temp_output_path, audio_data, 24000)
+                else:
+                    raise ValueError("No audio generated by Kokoro")
             else:
+                lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
+                tts = gTTS(translated_text, lang=lang_code)
+                temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gtts.mp3")
+                tts.save(temp_output_path)
         return jsonify({
             'transcription': transcription,
             'translation': translated_text,
             'audio_url': f'/download/{os.path.basename(temp_output_path)}'
         })
     except Exception as e:
+        app.logger.error(f"Error: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')