Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Athspi committed 27 days ago

Commit

bfc5175

verified ·

1 Parent(s): 9ffbfd1

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -47

app.py CHANGED Viewed

@@ -1,122 +1,145 @@
 import os
 import numpy as np
-from flask import Flask, request, jsonify, send_file, send_from_directory
-import google.generativeai as genai
-from gtts import gTTS, lang
 import tempfile
 import soundfile as sf
-from kokoro import KPipeline
-from werkzeug.utils import secure_filename
 from flask_cors import CORS
 # Flask application; index.html is served out of the 'static' folder.
 app = Flask(__name__, static_folder='static')
 CORS(app)
 # Configure Gemini API: refuse to start when no key is present in the environment.
 GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
 # Language configurations: display name -> Kokoro pipeline language code.
 KOKORO_LANGUAGES = {
     "American English": "a", "British English": "b", "Mandarin Chinese": "z",
     "Spanish": "e", "French": "f", "Hindi": "h", "Italian": "i",
     "Brazilian Portuguese": "p",
 }
 # gTTS table: language code -> display name.
 GTTS_LANGUAGES = lang.tts_langs()
 GTTS_LANGUAGES['ja'] = 'Japanese'  # Explicit Japanese support
 # Every distinct display name offered by either engine, sorted alphabetically.
 SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values()))
 @app.route('/')
 def serve_index():
     """Serve index.html from the app's static folder."""
     static_dir = app.static_folder
     return send_from_directory(static_dir, 'index.html')
 @app.route('/languages')
 def get_languages():
     """Return the SUPPORTED_LANGUAGES list as a JSON response."""
     payload = jsonify(SUPPORTED_LANGUAGES)
     return payload
 @app.route('/translate', methods=['POST'])
 def translate_audio():
     """Transcribe an uploaded audio clip, translate it, and synthesize speech.
 
     Reads the ``audio`` file and the optional ``language`` form field
     (default ``'English'``) from the request.
 
     Returns:
         JSON with ``transcription``, ``translation`` and ``audio_url`` on
         success; ``{'error': ...}`` with status 400 for a missing, empty or
         unsupported upload, or status 500 when any later step raises.
     """
     try:
         if 'audio' not in request.files:
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
         target_language = request.form.get('language', 'English')
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
         # Validate MIME type
         allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
         # Transcribe audio using Gemini
         model = genai.GenerativeModel("gemini-2.0-flash")
         audio_blob = {'mime_type': audio_file.mimetype, 'data': audio_file.read()}
         convo = model.start_chat()
         convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
         response = convo.send_message(audio_blob)
         transcription = response.text.strip()
         # Translate text using Gemini
         prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
         response = model.generate_content(prompt)
         translated_text = response.text.strip()
         # Generate TTS
         if target_language in KOKORO_LANGUAGES:
             pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[target_language])
             generator = pipeline(translated_text, voice="af_heart", speed=1)
             # Collect all audio segments, skipping chunks that produced no audio
             audio_segments = [audio for _, _, audio in generator if audio is not None]
             if not audio_segments:
                 raise ValueError("No audio generated by Kokoro")
             audio_data = np.concatenate(audio_segments)
             # mkstemp returns an OPEN descriptor; close it so each request
             # does not leak one (the writer below reopens the path itself).
             fd, temp_output_path = tempfile.mkstemp(suffix=".wav")
             os.close(fd)
             sf.write(temp_output_path, audio_data, 24000)
         else:
             # Standard gTTS handling; unknown display names fall back to 'en'
             lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
             tts = gTTS(translated_text, lang=lang_code)
             fd, temp_output_path = tempfile.mkstemp(suffix=".mp3")
             os.close(fd)
             tts.save(temp_output_path)
         return jsonify({
             'transcription': transcription,
             'translation': translated_text,
             'audio_url': f'/download/{os.path.basename(temp_output_path)}'
         })
     except Exception as e:
         # logger.exception keeps the traceback alongside the message
         app.logger.exception(f"Error processing request: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')
 def download_file(filename):
     try:
@@ -129,5 +152,17 @@ def download_file(filename):
     except FileNotFoundError:
         return jsonify({'error': 'File not found'}), 404
 if __name__ == '__main__':
     # Listen on every interface, port 7860.
     app.run(port=7860, host="0.0.0.0")

 import os
 import numpy as np
 import tempfile
 import soundfile as sf
+import wave
+from flask import Flask, request, jsonify, send_file, send_from_directory
 from flask_cors import CORS
+from werkzeug.utils import secure_filename
+from kokoro import KPipeline
+from gtts import gTTS, lang
+from google import generativeai as genai
+from google.genai import types
# Flask app setup
app = Flask(__name__, static_folder='static')
CORS(app)
# Gemini API configuration: refuse to start without a key.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")
# `genai` is the legacy google.generativeai SDK (configure / GenerativeModel).
genai.configure(api_key=GEMINI_API_KEY)
# `Client` is part of the newer google-genai SDK -- the same package that
# provides `google.genai.types` -- and does not exist on google.generativeai,
# so it is imported under a separate name instead of calling `genai.Client`.
from google import genai as genai_sdk
client = genai_sdk.Client(api_key=GEMINI_API_KEY)
# Language support
# Display name -> Kokoro pipeline language code.
KOKORO_LANGUAGES = {
    "American English": "a", "British English": "b", "Mandarin Chinese": "z",
    "Spanish": "e", "French": "f", "Hindi": "h", "Italian": "i", "Brazilian Portuguese": "p"
}
# gTTS table: language code -> display name.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES['ja'] = 'Japanese'
# Every distinct display name offered by Kokoro or gTTS, sorted alphabetically.
SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values()))
# BCP-47 locale -> prebuilt Gemini TTS voice name (currently the same voice everywhere).
GEMINI_VOICES = {
    "ar-EG": "Kore", "de-DE": "Kore", "en-US": "Kore", "es-US": "Kore", "fr-FR": "Kore",
    "hi-IN": "Kore", "id-ID": "Kore", "it-IT": "Kore", "ja-JP": "Kore", "ko-KR": "Kore",
    "pt-BR": "Kore", "ru-RU": "Kore", "nl-NL": "Kore", "pl-PL": "Kore", "th-TH": "Kore",
    "tr-TR": "Kore", "vi-VN": "Kore", "ro-RO": "Kore", "uk-UA": "Kore", "bn-BD": "Kore",
    "en-IN": "Kore", "mr-IN": "Kore", "ta-IN": "Kore", "te-IN": "Kore"
}
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Destination path (opened for binary writing).
        pcm: Raw sample bytes to store as the audio frames.
        channels: Number of audio channels.
        rate: Sampling rate in frames per second.
        sample_width: Bytes per sample.
    """
    with wave.open(filename, "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # nframes is left at 0 and derived from the written data.
        writer.setparams((channels, sample_width, rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm)
 @app.route('/')
 def serve_index():
     """Serve index.html from the app's static folder."""
     static_dir = app.static_folder
     return send_from_directory(static_dir, 'index.html')
 @app.route('/languages')
 def get_languages():
     """Return the SUPPORTED_LANGUAGES list as a JSON response."""
     payload = jsonify(SUPPORTED_LANGUAGES)
     return payload
 def _reserve_output_path(prefix, suffix):
     """Create an empty, uniquely named file in the temp directory and return its path."""
     fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix)
     # Only the unique name is needed; the audio writers reopen the path themselves.
     os.close(fd)
     return path
 @app.route('/translate', methods=['POST'])
 def translate_audio():
     """Transcribe an uploaded audio clip, translate it, and synthesize speech.
 
     Reads the ``audio`` file and the optional ``language`` form field
     (default ``'English'``) from the request. Speech is produced by Gemini
     TTS when the language maps to a Gemini voice, else by Kokoro when the
     language is in KOKORO_LANGUAGES, else by gTTS.
 
     Returns:
         JSON with ``transcription``, ``translation`` and ``audio_url`` on
         success; ``{'error': ...}`` with status 400 for a missing, empty or
         unsupported upload, or status 500 when any later step raises.
     """
     try:
         if 'audio' not in request.files:
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
         target_language = request.form.get('language', 'English')
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
         allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
         if audio_file.mimetype not in allowed_mime_types:
             return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
         # Step 1: transcribe the upload with Gemini.
         model = genai.GenerativeModel("gemini-2.0-flash")
         audio_blob = {'mime_type': audio_file.mimetype, 'data': audio_file.read()}
         convo = model.start_chat()
         convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
         response = convo.send_message(audio_blob)
         transcription = response.text.strip()
         # Step 2: translate the transcription.
         prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
         response = model.generate_content(prompt)
         translated_text = response.text.strip()
         # Step 3: synthesize speech. Output files get unique names so that
         # concurrent uploads sharing a filename cannot overwrite each other.
         voice_name = GEMINI_VOICES.get(get_bcp47_code(target_language))
         if voice_name:
             response = client.models.generate_content(
                 model="gemini-2.5-flash-preview-tts",
                 contents=translated_text,
                 config=types.GenerateContentConfig(
                     response_modalities=["AUDIO"],
                     speech_config=types.SpeechConfig(
                         voice_config=types.VoiceConfig(
                             prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
                         )
                     )
                 )
             )
             # Fail with a clear message when the response carries no audio part.
             try:
                 data = response.candidates[0].content.parts[0].inline_data.data
             except (AttributeError, IndexError, TypeError):
                 data = None
             if not data:
                 raise ValueError("No audio generated by Gemini TTS")
             temp_path = _reserve_output_path('tts_', '.wav')
             wave_file(temp_path, data)
         elif target_language in KOKORO_LANGUAGES:
             pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[target_language])
             generator = pipeline(translated_text, voice="af_heart", speed=1)
             audio_segments = [audio for _, _, audio in generator if audio is not None]
             if not audio_segments:
                 raise ValueError("No audio generated by Kokoro")
             audio_data = np.concatenate(audio_segments)
             temp_path = _reserve_output_path('kokoro_', '.wav')
             sf.write(temp_path, audio_data, 24000)
         else:
             # gTTS fallback; unknown display names fall back to 'en'.
             lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
             tts = gTTS(translated_text, lang=lang_code)
             temp_path = _reserve_output_path('gtts_', '.mp3')
             tts.save(temp_path)
         return jsonify({
             'transcription': transcription,
             'translation': translated_text,
             'audio_url': f'/download/{os.path.basename(temp_path)}'
         })
     except Exception as e:
         # logger.exception keeps the traceback alongside the message
         app.logger.exception(f"Error processing request: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')
 def download_file(filename):
     try:
     except FileNotFoundError:
         return jsonify({'error': 'File not found'}), 404
def get_bcp47_code(language):
    """Look up the BCP-47 locale tag for a language display name.

    Args:
        language: Display name such as ``"English"`` (exact, case-sensitive match).

    Returns:
        The locale tag (e.g. ``"en-US"``), or ``None`` when the name is not in the table.
    """
    locale_by_name = {
        "Arabic": "ar-EG",
        "German": "de-DE",
        "English": "en-US",
        "Spanish": "es-US",
        "French": "fr-FR",
        "Hindi": "hi-IN",
        "Indonesian": "id-ID",
        "Italian": "it-IT",
        "Japanese": "ja-JP",
        "Korean": "ko-KR",
        "Portuguese": "pt-BR",
        "Russian": "ru-RU",
        "Dutch": "nl-NL",
        "Polish": "pl-PL",
        "Thai": "th-TH",
        "Turkish": "tr-TR",
        "Vietnamese": "vi-VN",
        "Romanian": "ro-RO",
        "Ukrainian": "uk-UA",
        "Bengali": "bn-BD",
        "Indian English": "en-IN",
        "Marathi": "mr-IN",
        "Tamil": "ta-IN",
        "Telugu": "te-IN",
    }
    if language in locale_by_name:
        return locale_by_name[language]
    return None
 if __name__ == '__main__':
     # Listen on every interface, port 7860.
     app.run(port=7860, host="0.0.0.0")