Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Community

Athspi committed on May 23

Commit

d7f2b05

verified ·

1 Parent(s): 9e7d27b

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -78

app.py CHANGED Viewed

@@ -1,15 +1,14 @@
 import os
-import tempfile
 import numpy as np
-import soundfile as sf
-import wave
 from flask import Flask, request, jsonify, send_file, send_from_directory
-from flask_cors import CORS
 import google.generativeai as genai
-from google.generativeai import types
-# Initialize Flask app
 app = Flask(__name__, static_folder='static')
 CORS(app)
@@ -19,51 +18,42 @@ if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
-# Supported languages and their BCP-47 codes
-SUPPORTED_LANGUAGES = {
-    "Arabic (Egyptian)": "ar-EG",
-    "German (Germany)": "de-DE",
-    "English (US)": "en-US",
-    "Spanish (US)": "es-US",
-    "French (France)": "fr-FR",
-    "Hindi (India)": "hi-IN",
-    "Indonesian (Indonesia)": "id-ID",
-    "Italian (Italy)": "it-IT",
-    "Japanese (Japan)": "ja-JP",
-    "Korean (Korea)": "ko-KR",
-    "Portuguese (Brazil)": "pt-BR",
-    "Russian (Russia)": "ru-RU",
-    "Dutch (Netherlands)": "nl-NL",
-    "Polish (Poland)": "pl-PL",
-    "Thai (Thailand)": "th-TH",
-    "Turkish (Turkey)": "tr-TR",
-    "Vietnamese (Vietnam)": "vi-VN",
-    "Romanian (Romania)": "ro-RO",
-    "Ukrainian (Ukraine)": "uk-UA",
-    "Bengali (Bangladesh)": "bn-BD",
-    "English (India)": "en-IN",
-    "Marathi (India)": "mr-IN",
-    "Tamil (India)": "ta-IN",
-    "Telugu (India)": "te-IN"
 }
 @app.route('/')
 def serve_index():
     return send_from_directory(app.static_folder, 'index.html')
 @app.route('/languages')
 def get_languages():
-    return jsonify(list(SUPPORTED_LANGUAGES.keys()))
 @app.route('/translate', methods=['POST'])
 def translate_audio():
     try:
         if 'audio' not in request.files:
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
-        target_language = request.form.get('language', 'English (US)')
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
@@ -72,16 +62,16 @@ def translate_audio():
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
-        # Read audio data
-        audio_data = audio_file.read()
         # Transcribe audio using Gemini
         model = genai.GenerativeModel("gemini-2.0-flash")
         audio_blob = {
             'mime_type': audio_file.mimetype,
-            'data': audio_data
         }
         convo = model.start_chat()
         convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
         response = convo.send_message(audio_blob)
@@ -91,47 +81,38 @@ def translate_audio():
         prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
         response = model.generate_content(prompt)
         translated_text = response.text.strip()
-        # Generate TTS using Gemini
-        # Initialize Gemini client
-        client = genai.Client(api_key=GEMINI_API_KEY)
-        # Determine language code
-        lang_code = SUPPORTED_LANGUAGES.get(target_language, 'en-US')
-        # Generate speech
-        response = client.models.generate_content(
-            model="gemini-2.5-flash-preview-tts",
-            contents=translated_text,
-            config=types.GenerateContentConfig(
-                response_modalities=["AUDIO"],
-                speech_config=types.SpeechConfig(
-                    voice_config=types.VoiceConfig(
-                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
-                            voice_name='Kore'  # You can change the voice as needed
-                        )
-                    )
-                ),
-            )
-        )
-        # Extract audio data
-        audio_output = response.candidates[0].content.parts[0].inline_data.data
-        # Save audio to temporary file
-        temp_fd, temp_output_path = tempfile.mkstemp(suffix=".wav")
-        with wave.open(temp_output_path, "wb") as wf:
-            wf.setnchannels(1)
-            wf.setsampwidth(2)
-            wf.setframerate(24000)
-            wf.writeframes(audio_output)
         return jsonify({
             'transcription': transcription,
             'translation': translated_text,
             'audio_url': f'/download/{os.path.basename(temp_output_path)}'
         })
     except Exception as e:
         app.logger.error(f"Error processing request: {str(e)}")
         return jsonify({'error': str(e)}), 500
@@ -141,7 +122,7 @@ def download_file(filename):
     try:
         return send_file(
             os.path.join(tempfile.gettempdir(), filename),
-            mimetype="audio/wav",
             as_attachment=True,
             download_name=f"translated_{filename}"
         )

 import os
 import numpy as np
 from flask import Flask, request, jsonify, send_file, send_from_directory
 import google.generativeai as genai
+from gtts import gTTS, lang
+import tempfile
+import soundfile as sf
+from kokoro import KPipeline
+from werkzeug.utils import secure_filename
+from flask_cors import CORS
 app = Flask(__name__, static_folder='static')
 CORS(app)
     raise ValueError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
+# Language configurations
+KOKORO_LANGUAGES = {
+    "American English": "a",
+    "British English": "b",
+    "Mandarin Chinese": "z",
+    "Spanish": "e",
+    "French": "f",
+    "Hindi": "h",
+    "Italian": "i",
+    "Brazilian Portuguese": "p"
 }
+GTTS_LANGUAGES = lang.tts_langs()
+GTTS_LANGUAGES['ja'] = 'Japanese'  # Explicit Japanese support
+SUPPORTED_LANGUAGES = sorted(
+    list(set(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values())))
+)
 @app.route('/')
 def serve_index():
     return send_from_directory(app.static_folder, 'index.html')
 @app.route('/languages')
 def get_languages():
+    return jsonify(SUPPORTED_LANGUAGES)
 @app.route('/translate', methods=['POST'])
 def translate_audio():
     try:
         if 'audio' not in request.files:
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
+        target_language = request.form.get('language', 'English')
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
         # Transcribe audio using Gemini
         model = genai.GenerativeModel("gemini-2.0-flash")
+        # Create proper audio blob
         audio_blob = {
             'mime_type': audio_file.mimetype,
+            'data': audio_file.read()
         }
+        # Get transcription
         convo = model.start_chat()
         convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
         response = convo.send_message(audio_blob)
         prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
         response = model.generate_content(prompt)
         translated_text = response.text.strip()
+        # Generate TTS
+        if target_language in KOKORO_LANGUAGES:
+            lang_code = KOKORO_LANGUAGES[target_language]
+            pipeline = KPipeline(lang_code=lang_code)
+            generator = pipeline(translated_text, voice="af_heart", speed=1)
+            # Collect all audio segments
+            audio_segments = []
+            for _, _, audio in generator:
+                if audio is not None:
+                    audio_segments.append(audio)
+            if audio_segments:
+                audio_data = np.concatenate(audio_segments)
+                _, temp_output_path = tempfile.mkstemp(suffix=".wav")
+                sf.write(temp_output_path, audio_data, 24000)
+            else:
+                raise ValueError("No audio generated by Kokoro")
+        else:
+            # Standard gTTS handling
+            lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
+            tts = gTTS(translated_text, lang=lang_code)
+            _, temp_output_path = tempfile.mkstemp(suffix=".mp3")
+            tts.save(temp_output_path)
         return jsonify({
             'transcription': transcription,
             'translation': translated_text,
             'audio_url': f'/download/{os.path.basename(temp_output_path)}'
         })
     except Exception as e:
         app.logger.error(f"Error processing request: {str(e)}")
         return jsonify({'error': str(e)}), 500
     try:
         return send_file(
             os.path.join(tempfile.gettempdir(), filename),
+            mimetype="audio/mpeg",
             as_attachment=True,
             download_name=f"translated_{filename}"
         )