Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Athspi committed 14 days ago

Commit

9e7d27b

verified ·

1 Parent(s): e51d62b

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -150

app.py CHANGED Viewed

@@ -1,186 +1,152 @@
 import os
 import tempfile
-import wave
 import numpy as np
 import soundfile as sf
 from flask import Flask, request, jsonify, send_file, send_from_directory
 from flask_cors import CORS
-from werkzeug.utils import secure_filename
-from gtts import gTTS, lang
-from kokoro import KPipeline
 import google.generativeai as genai
-from google.generativeai.types import (
-    GenerateContentConfig,
-    SpeechConfig,
-    VoiceConfig,
-    PrebuiltVoiceConfig,
-)
-# -----------------------------------------------------------------------------
-#  Configuration
-# -----------------------------------------------------------------------------
-# 1) Make sure you've run:
-#      pip install --upgrade google-generativeai gTTS soundfile kokoro flask flask-cors werkzeug
-#
-# 2) Set your Gemini API key in the environment:
-#      export GEMINI_API_KEY="your_real_api_key_here"
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
-    raise RuntimeError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
-client = genai.Client(api_key=GEMINI_API_KEY)
-# Kokoro and gTTS language maps
-KOKORO_LANGUAGES = {
-    "American English": "a",
-    "British English": "b",
-    "Mandarin Chinese": "z",
-    "Spanish": "e",
-    "French": "f",
-    "Hindi": "h",
-    "Italian": "i",
-    "Brazilian Portuguese": "p",
-}
-GTTS_LANGUAGES = lang.tts_langs()
-GTTS_LANGUAGES["ja"] = "Japanese"  # explicit Japanese support
-SUPPORTED_LANGUAGES = sorted(
-    set(KOKORO_LANGUAGES.keys()) | set(GTTS_LANGUAGES.values())
-)
-# Voice name for Gemini TTS preview
-GEMINI_VOICE_NAME = "Kore"
-# -----------------------------------------------------------------------------
-#  Helpers
-# -----------------------------------------------------------------------------
-def wave_file(filename: str, pcm: bytes, channels=1, rate=24000, sample_width=2):
-    """Write raw PCM into a .wav file."""
-    with wave.open(filename, "wb") as wf:
-        wf.setnchannels(channels)
-        wf.setsampwidth(sample_width)
-        wf.setframerate(rate)
-        wf.writeframes(pcm)
-# -----------------------------------------------------------------------------
-#  Flask App
-# -----------------------------------------------------------------------------
-app = Flask(__name__, static_folder="static")
-CORS(app)
-@app.route("/")
 def serve_index():
-    # serve your index.html from ./static/index.html
-    return send_from_directory(app.static_folder, "index.html")
-@app.route("/languages")
-def list_languages():
-    return jsonify(SUPPORTED_LANGUAGES)
-@app.route("/translate", methods=["POST"])
 def translate_audio():
     try:
-        # 1. Receive file + target language
-        if "audio" not in request.files:
-            return jsonify(error="No audio file uploaded"), 400
-        audio_file = request.files["audio"]
-        target_lang = request.form.get("language", "English")
-        if not audio_file or audio_file.filename == "":
-            return jsonify(error="Invalid audio file"), 400
-        # 2. Validate MIME type
-        if audio_file.mimetype not in ("audio/wav", "audio/mpeg", "audio/mp4", "audio/webm"):
-            return jsonify(error=f"Unsupported file type: {audio_file.mimetype}"), 400
-        # 3. Transcribe with Gemini
         model = genai.GenerativeModel("gemini-2.0-flash")
-        blob = {"mime_type": audio_file.mimetype, "data": audio_file.read()}
         convo = model.start_chat()
-        convo.send_message(
-            "You are a professional transcriber. Transcribe this audio accurately, verbatim."
-        )
-        resp = convo.send_message(blob)
-        transcription = resp.text.strip()
-        # 4. Translate with Gemini
-        prompt = f"Translate the following text to {target_lang}, preserving meaning and cultural nuances:\n\n{transcription}"
-        translation_resp = model.generate_content(prompt)
-        translated_text = translation_resp.text.strip()
-        # 5. Try Gemini TTS 2.5 preview
-        try:
-            tts_resp = client.models.generate_content(
-                model="gemini-2.5-flash-preview-tts",
-                contents=translated_text,
-                config=GenerateContentConfig(
-                    response_modalities=["AUDIO"],
-                    speech_config=SpeechConfig(
-                        voice_config=VoiceConfig(
-                            prebuilt_voice_config=PrebuiltVoiceConfig(
-                                voice_name=GEMINI_VOICE_NAME
-                            )
                         )
-                    ),
                 ),
             )
-            pcm_data = tts_resp.candidates[0].content.parts[0].inline_data.data
-            out_path = os.path.join(tempfile.gettempdir(), f"tts_gemini.wav")
-            wave_file(out_path, pcm_data)
-        except Exception:
-            # Fallback: Kokoro
-            if target_lang in KOKORO_LANGUAGES:
-                code = KOKORO_LANGUAGES[target_lang]
-                pipeline = KPipeline(lang_code=code)
-                generator = pipeline(translated_text, voice="af_heart", speed=1)
-                segments = [audio for _, _, audio in generator if audio is not None]
-                if segments:
-                    arr = np.concatenate(segments)
-                    out_path = os.path.join(tempfile.gettempdir(), "tts_kokoro.wav")
-                    sf.write(out_path, arr, 24000)
-                else:
-                    raise RuntimeError("Kokoro produced no audio")
-            # Final fallback: gTTS
-            else:
-                gtts_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_lang), "en")
-                tts = gTTS(translated_text, lang=gtts_code)
-                out_path = os.path.join(tempfile.gettempdir(), "tts_gtts.mp3")
-                tts.save(out_path)
-        return jsonify(
-            transcription=transcription,
-            translation=translated_text,
-            audio_url=f"/download/{os.path.basename(out_path)}",
         )
-    except Exception as e:
-        app.logger.exception("Error in /translate")
-        return jsonify(error=str(e)), 500
-@app.route("/download/<filename>")
-def download_file(filename):
-    path = os.path.join(tempfile.gettempdir(), filename)
-    if not os.path.isfile(path):
-        return jsonify(error="File not found"), 404
-    return send_file(path, as_attachment=True, download_name=f"translated_{filename}")
-if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)

 import os
 import tempfile
 import numpy as np
 import soundfile as sf
+import wave
 from flask import Flask, request, jsonify, send_file, send_from_directory
 from flask_cors import CORS
 import google.generativeai as genai
+from google.generativeai import types
+# Initialize Flask app
+app = Flask(__name__, static_folder='static')
+CORS(app)
+# Configure Gemini API
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
+    raise ValueError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
+# Supported languages and their BCP-47 codes
+SUPPORTED_LANGUAGES = {
+    "Arabic (Egyptian)": "ar-EG",
+    "German (Germany)": "de-DE",
+    "English (US)": "en-US",
+    "Spanish (US)": "es-US",
+    "French (France)": "fr-FR",
+    "Hindi (India)": "hi-IN",
+    "Indonesian (Indonesia)": "id-ID",
+    "Italian (Italy)": "it-IT",
+    "Japanese (Japan)": "ja-JP",
+    "Korean (Korea)": "ko-KR",
+    "Portuguese (Brazil)": "pt-BR",
+    "Russian (Russia)": "ru-RU",
+    "Dutch (Netherlands)": "nl-NL",
+    "Polish (Poland)": "pl-PL",
+    "Thai (Thailand)": "th-TH",
+    "Turkish (Turkey)": "tr-TR",
+    "Vietnamese (Vietnam)": "vi-VN",
+    "Romanian (Romania)": "ro-RO",
+    "Ukrainian (Ukraine)": "uk-UA",
+    "Bengali (Bangladesh)": "bn-BD",
+    "English (India)": "en-IN",
+    "Marathi (India)": "mr-IN",
+    "Tamil (India)": "ta-IN",
+    "Telugu (India)": "te-IN"
+}
+@app.route('/')
 def serve_index():
+    return send_from_directory(app.static_folder, 'index.html')
+@app.route('/languages')
+def get_languages():
+    return jsonify(list(SUPPORTED_LANGUAGES.keys()))
+@app.route('/translate', methods=['POST'])
 def translate_audio():
     try:
+        if 'audio' not in request.files:
+            return jsonify({'error': 'No audio file uploaded'}), 400
+        audio_file = request.files['audio']
+        target_language = request.form.get('language', 'English (US)')
+        if not audio_file or audio_file.filename == '':
+            return jsonify({'error': 'Invalid audio file'}), 400
+        # Validate MIME type
+        allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
+        if audio_file.mimetype not in allowed_mime_types:
+            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
+        # Read audio data
+        audio_data = audio_file.read()
+        # Transcribe audio using Gemini
         model = genai.GenerativeModel("gemini-2.0-flash")
+        audio_blob = {
+            'mime_type': audio_file.mimetype,
+            'data': audio_data
+        }
         convo = model.start_chat()
+        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
+        response = convo.send_message(audio_blob)
+        transcription = response.text.strip()
+        # Translate text using Gemini
+        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
+        response = model.generate_content(prompt)
+        translated_text = response.text.strip()
+        # Generate TTS using Gemini
+        # Initialize Gemini client
+        client = genai.Client(api_key=GEMINI_API_KEY)
+        # Determine language code
+        lang_code = SUPPORTED_LANGUAGES.get(target_language, 'en-US')
+        # Generate speech
+        response = client.models.generate_content(
+            model="gemini-2.5-flash-preview-tts",
+            contents=translated_text,
+            config=types.GenerateContentConfig(
+                response_modalities=["AUDIO"],
+                speech_config=types.SpeechConfig(
+                    voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                            voice_name='Kore'  # You can change the voice as needed
                         )
+                    )
                 ),
             )
         )
+        # Extract audio data
+        audio_output = response.candidates[0].content.parts[0].inline_data.data
+        # Save audio to temporary file
+        temp_fd, temp_output_path = tempfile.mkstemp(suffix=".wav")
+        with wave.open(temp_output_path, "wb") as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)
+            wf.setframerate(24000)
+            wf.writeframes(audio_output)
+        return jsonify({
+            'transcription': transcription,
+            'translation': translated_text,
+            'audio_url': f'/download/{os.path.basename(temp_output_path)}'
+        })
+    except Exception as e:
+        app.logger.error(f"Error processing request: {str(e)}")
+        return jsonify({'error': str(e)}), 500
+@app.route('/download/<filename>')
+def download_file(filename):
+    try:
+        return send_file(
+            os.path.join(tempfile.gettempdir(), filename),
+            mimetype="audio/wav",
+            as_attachment=True,
+            download_name=f"translated_{filename}"
+        )
+    except FileNotFoundError:
+        return jsonify({'error': 'File not found'}), 404
+if __name__ == '__main__':
     app.run(host="0.0.0.0", port=7860)