Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Community

Athspi committed on May 23

Commit

9ffbfd1

verified ·

1 Parent(s): 132c026

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -105

app.py CHANGED Viewed

@@ -1,15 +1,13 @@
 import os
-# import numpy as np # No longer needed for TTS
 from flask import Flask, request, jsonify, send_file, send_from_directory
 import google.generativeai as genai
-from google.generativeai import types as genai_types # For clarity if needed, or use genai.types
-# from gtts import gTTS, lang # Removed
 import tempfile
-# import soundfile as sf # Removed, using wave module instead
-# from kokoro import KPipeline # Removed
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
-import wave # Added for saving WAV files
 app = Flask(__name__, static_folder='static')
 CORS(app)
@@ -20,53 +18,24 @@ if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
-# Transcription and Translation Model
-TRANSCRIPTION_TRANSLATION_MODEL_NAME = "gemini-2.0-flash" # Using 1.5 flash as it's common, was "gemini-2.0-flash"
-# Text-to-Speech Model
-TTS_MODEL_NAME = "gemini-2.5-flash-preview-tts" # Using a model known to support audio output modality.
-                                      # The user's example mentioned "gemini-2.5-flash-preview-tts".
-                                      # If that specific model works with response_mime_type, it can be used.
-# Gemini TTS Supported Languages (Display Name: BCP-47 Code)
-# Based on the user-provided list. The TTS API auto-detects language from text.
-# This list is primarily for the frontend language selector.
-GEMINI_TTS_LANGUAGES = {
-    "Arabic (Egyptian)": "ar-EG",
-    "German (Germany)": "de-DE",
-    "English (US)": "en-US",
-    "Spanish (US)": "es-US",
-    "French (France)": "fr-FR",
-    "Hindi (India)": "hi-IN",
-    "Indonesian (Indonesia)": "id-ID",
-    "Italian (Italy)": "it-IT",
-    "Japanese (Japan)": "ja-JP",
-    "Korean (Korea)": "ko-KR",
-    "Portuguese (Brazil)": "pt-BR",
-    "Russian (Russia)": "ru-RU",
-    "Dutch (Netherlands)": "nl-NL",
-    "Polish (Poland)": "pl-PL",
-    "Thai (Thailand)": "th-TH",
-    "Turkish (Turkey)": "tr-TR",
-    "Vietnamese (Vietnam)": "vi-VN",
-    "Romanian (Romania)": "ro-RO",
-    "Ukrainian (Ukraine)": "uk-UA",
-    "Bengali (Bangladesh)": "bn-BD",
-    "English (India)": "en-IN",
-    "Marathi (India)": "mr-IN",
-    "Tamil (India)": "ta-IN",
-    "Telugu (India)": "te-IN"
 }
-SUPPORTED_LANGUAGES = sorted(list(GEMINI_TTS_LANGUAGES.keys()))
-# Helper function to save PCM data as a WAV file
-def save_wave_file(filename, pcm_data, channels=1, sample_width=2, frame_rate=24000):
-    """Saves PCM audio data to a WAV file."""
-    with wave.open(filename, "wb") as wf:
-        wf.setnchannels(channels)
-        wf.setsampwidth(sample_width)  # Bytes per sample
-        wf.setframerate(frame_rate)
-        wf.writeframes(pcm_data)
 @app.route('/')
 def serve_index():
@@ -83,63 +52,60 @@ def translate_audio():
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
-        target_language_display_name = request.form.get('language', 'English (US)') # Default to a common one
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
-        # Validate MIME type for transcription
-        allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp3', 'audio/ogg', 'audio/flac', 'audio/mp4', 'audio/webm', 'audio/amr']
         if audio_file.mimetype not in allowed_mime_types:
-            return jsonify({'error': f'Unsupported file type for transcription: {audio_file.mimetype}'}), 400
-        # Initialize Gemini model for transcription and translation
-        model = genai.GenerativeModel(TRANSCRIPTION_TRANSLATION_MODEL_NAME)
-        audio_data_bytes = audio_file.read()
-        audio_blob = genai_types.Blob(mime_type=audio_file.mimetype, data=audio_data_bytes)
         # Get transcription
-        # Forcing transcription to be in original language can be tricky if the model tends to translate.
-        # A more robust prompt might be needed if issues arise.
-        transcription_prompt = "You are a professional transcriber. Transcribe this audio accurately and verbatim in its original spoken language. Respond only with the transcription."
-        # Using genai.upload_file for larger files if needed, but for direct blob:
-        response = model.generate_content([transcription_prompt, audio_blob])
         transcription = response.text.strip()
         # Translate text using Gemini
-        translation_prompt = f"Translate the following text to {target_language_display_name}. Preserve meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
-        response = model.generate_content(translation_prompt)
         translated_text = response.text.strip()
-        # Generate TTS using Gemini
-        tts_model = genai.GenerativeModel(TTS_MODEL_NAME)
-        # Gemini TTS detects language from the text.
-        # The voice selection is typically handled by the model or default voice for the detected language.
-        # The user's snippet for `speech_config` and `voice_name='Kore'` is not directly compatible
-        # with the current `google-generativeai` SDK's `GenerativeModel.generate_content` method
-        # in a straightforward way. This method uses `response_mime_type` for audio output.
-        tts_generation_config = genai_types.GenerationConfig(
-            response_mime_type="audio/wav" # Gemini will output WAV audio
-        )
-        # The content for TTS is just the translated text.
-        tts_response = tts_model.generate_content(
-            contents=[translated_text], # Make sure contents is an iterable of Parts or strings
-            generation_config=tts_generation_config
-        )
-        if not (tts_response.candidates and tts_response.candidates[0].content.parts):
-            raise ValueError("Gemini TTS did not return audio data.")
-        audio_pcm_data = tts_response.candidates[0].content.parts[0].inline_data.data
-        _, temp_output_path = tempfile.mkstemp(suffix=".wav")
-        # Default parameters from the user's example: rate=24000, sample_width=2 (16-bit), channels=1
-        save_wave_file(temp_output_path, audio_pcm_data, channels=1, sample_width=2, frame_rate=24000)
         return jsonify({
             'transcription': transcription,
@@ -148,27 +114,20 @@ def translate_audio():
         })
     except Exception as e:
-        app.logger.error(f"Error processing request: {str(e)}", exc_info=True)
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')
 def download_file(filename):
     try:
-        # tempfile.gettempdir() is the directory where mkstemp creates files
-        file_path = os.path.join(tempfile.gettempdir(), filename)
         return send_file(
-            file_path,
-            mimetype="audio/wav", # Changed from mpeg to wav
             as_attachment=True,
-            download_name=f"translated_{filename.replace(tempfile.gettempdir(), '')}" # Cleaner name
         )
     except FileNotFoundError:
         return jsonify({'error': 'File not found'}), 404
-    except Exception as e:
-        app.logger.error(f"Error downloading file: {str(e)}", exc_info=True)
-        return jsonify({'error': f"Error downloading file: {str(e)}"}), 500
 if __name__ == '__main__':
-    # Consider adding an environment variable for debug mode for production
-    app.run(host="0.0.0.0", port=7860) # Added debug=True for development

 import os
+import numpy as np
 from flask import Flask, request, jsonify, send_file, send_from_directory
 import google.generativeai as genai
+from gtts import gTTS, lang
 import tempfile
+import soundfile as sf
+from kokoro import KPipeline
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
 app = Flask(__name__, static_folder='static')
 CORS(app)
     raise ValueError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
+# Language configurations
+KOKORO_LANGUAGES = {
+    "American English": "a",
+    "British English": "b",
+    "Mandarin Chinese": "z",
+    "Spanish": "e",
+    "French": "f",
+    "Hindi": "h",
+    "Italian": "i",
+    "Brazilian Portuguese": "p"
 }
+GTTS_LANGUAGES = lang.tts_langs()
+GTTS_LANGUAGES['ja'] = 'Japanese'  # Explicit Japanese support
+SUPPORTED_LANGUAGES = sorted(
+    list(set(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values())))
+)
 @app.route('/')
 def serve_index():
             return jsonify({'error': 'No audio file uploaded'}), 400
         audio_file = request.files['audio']
+        target_language = request.form.get('language', 'English')
         if not audio_file or audio_file.filename == '':
             return jsonify({'error': 'Invalid audio file'}), 400
+        # Validate MIME type
+        allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
         if audio_file.mimetype not in allowed_mime_types:
+            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
+        # Transcribe audio using Gemini
+        model = genai.GenerativeModel("gemini-2.0-flash")
+        # Create proper audio blob
+        audio_blob = {
+            'mime_type': audio_file.mimetype,
+            'data': audio_file.read()
+        }
         # Get transcription
+        convo = model.start_chat()
+        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
+        response = convo.send_message(audio_blob)
         transcription = response.text.strip()
         # Translate text using Gemini
+        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
+        response = model.generate_content(prompt)
         translated_text = response.text.strip()
+        # Generate TTS
+        if target_language in KOKORO_LANGUAGES:
+            lang_code = KOKORO_LANGUAGES[target_language]
+            pipeline = KPipeline(lang_code=lang_code)
+            generator = pipeline(translated_text, voice="af_heart", speed=1)
+            # Collect all audio segments
+            audio_segments = []
+            for _, _, audio in generator:
+                if audio is not None:
+                    audio_segments.append(audio)
+            if audio_segments:
+                audio_data = np.concatenate(audio_segments)
+                _, temp_output_path = tempfile.mkstemp(suffix=".wav")
+                sf.write(temp_output_path, audio_data, 24000)
+            else:
+                raise ValueError("No audio generated by Kokoro")
+        else:
+            # Standard gTTS handling
+            lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
+            tts = gTTS(translated_text, lang=lang_code)
+            _, temp_output_path = tempfile.mkstemp(suffix=".mp3")
+            tts.save(temp_output_path)
         return jsonify({
             'transcription': transcription,
         })
     except Exception as e:
+        app.logger.error(f"Error processing request: {str(e)}")
         return jsonify({'error': str(e)}), 500
 @app.route('/download/<filename>')
 def download_file(filename):
     try:
         return send_file(
+            os.path.join(tempfile.gettempdir(), filename),
+            mimetype="audio/mpeg",
             as_attachment=True,
+            download_name=f"translated_{filename}"
         )
     except FileNotFoundError:
         return jsonify({'error': 'File not found'}), 404
 if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=7860)