Spaces:
Running
Running
import os
import tempfile
import wave

import numpy as np
import soundfile as sf
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from google import genai as google_genai
from google import generativeai as genai
from google.genai import types
from gtts import gTTS, lang
from kokoro import KPipeline
from werkzeug.utils import secure_filename
# Flask app setup: the 'static' folder holds the front-end assets.
app = Flask(__name__, static_folder='static')
CORS(app)

# Gemini API configuration. The key is mandatory, so fail at import time
# rather than on the first request.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")

# Two Google SDKs are used side by side:
#   * `genai` (google.generativeai) is configured globally and provides
#     GenerativeModel for transcription/translation.
#   * `google_genai` (google.genai) provides the Client whose `models` API is
#     used together with `google.genai.types` for speech synthesis.
# google.generativeai has no `Client` attribute, so the client must be built
# from the google.genai module.
genai.configure(api_key=GEMINI_API_KEY)
client = google_genai.Client(api_key=GEMINI_API_KEY)
# Language support

# Display name -> single-letter language code handed to KPipeline.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# gTTS language code -> display name, extended with an explicit Japanese entry.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES['ja'] = 'Japanese'

# Every display name offered by either engine, de-duplicated and sorted.
SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values()))

# BCP-47 code -> Gemini prebuilt voice name; every listed locale uses the
# same voice.
GEMINI_VOICES = {
    locale: "Kore"
    for locale in (
        "ar-EG", "de-DE", "en-US", "es-US", "fr-FR",
        "hi-IN", "id-ID", "it-IT", "ja-JP", "ko-KR",
        "pt-BR", "ru-RU", "nl-NL", "pl-PL", "th-TH",
        "tr-TR", "vi-VN", "ro-RO", "uk-UA", "bn-BD",
        "en-IN", "mr-IN", "ta-IN", "te-IN",
    )
}
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM frames to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Destination path; an existing file is overwritten.
        pcm: Bytes-like object holding the raw PCM frames.
        channels: Number of audio channels.
        rate: Sampling rate in frames per second.
        sample_width: Sample width in bytes.
    """
    with wave.open(filename, "wb") as wav_out:
        # Tuple order: (nchannels, sampwidth, framerate, nframes, comptype,
        # compname). nframes=0 is only a placeholder; the wave module fixes
        # the header up from the frames actually written.
        wav_out.setparams((channels, sample_width, rate, 0, "NONE", "not compressed"))
        wav_out.writeframes(pcm)
def serve_index():
    """Return ``index.html`` from the application's static folder."""
    # NOTE(review): no route decorator is visible on this view here;
    # presumably it is registered elsewhere - confirm.
    static_root = app.static_folder
    return send_from_directory(static_root, 'index.html')
def get_languages():
    """Return the sorted list of supported language names as a JSON response."""
    languages_response = jsonify(SUPPORTED_LANGUAGES)
    return languages_response
def _write_temp_audio(prefix, suffix, write):
    """Create a uniquely named file in the temp directory and fill it via ``write``.

    Args:
        prefix: File-name prefix identifying the synthesis engine.
        suffix: File extension, including the leading dot.
        write: Callable that receives the reserved path and writes the audio.

    Returns:
        The path of the written file.

    Raises:
        Whatever ``write`` raises; the reserved file is removed first.
    """
    # mkstemp picks a name that cannot collide with another request's output,
    # unlike a name derived from the client-supplied upload filename.
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    os.close(fd)  # the writers reopen the file by path
    try:
        write(path)
    except Exception:
        # Do not leave an empty placeholder behind when synthesis fails.
        if os.path.exists(path):
            os.remove(path)
        raise
    return path


def translate_audio():
    """Transcribe an uploaded audio file, translate it, and synthesize speech.

    Reads the ``audio`` file and the optional ``language`` form field
    (default ``'English'``) from the current request. The audio is transcribed
    and translated with Gemini, then spoken with the first engine that
    applies: Gemini TTS when the language maps to a configured voice, Kokoro
    when the language is in ``KOKORO_LANGUAGES``, otherwise gTTS (falling back
    to ``'en'`` when the language name is unknown to gTTS).

    Returns:
        A JSON response with ``transcription``, ``translation`` and
        ``audio_url`` on success; ``({'error': ...}, 400)`` for a missing,
        empty or unsupported upload; ``({'error': ...}, 500)`` for any other
        failure.
    """
    # NOTE(review): no route decorator is visible on this view here;
    # presumably it is registered elsewhere - confirm.
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400
        audio_file = request.files['audio']
        target_language = request.form.get('language', 'English')
        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400
        allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
        if audio_file.mimetype not in allowed_mime_types:
            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400

        # Step 1: transcription. The instruction and the audio are sent as two
        # turns of the same chat so the model applies the former to the latter.
        model = genai.GenerativeModel("gemini-2.0-flash")
        audio_blob = {'mime_type': audio_file.mimetype, 'data': audio_file.read()}
        convo = model.start_chat()
        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
        response = convo.send_message(audio_blob)
        transcription = response.text.strip()

        # Step 2: translation.
        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
        response = model.generate_content(prompt)
        translated_text = response.text.strip()

        # Step 3: speech synthesis with the first applicable engine.
        voice_name = GEMINI_VOICES.get(get_bcp47_code(target_language))
        if voice_name:
            response = client.models.generate_content(
                model="gemini-2.5-flash-preview-tts",
                contents=translated_text,
                config=types.GenerateContentConfig(
                    response_modalities=["AUDIO"],
                    speech_config=types.SpeechConfig(
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
                        )
                    )
                )
            )
            data = response.candidates[0].content.parts[0].inline_data.data
            # wave_file's defaults (mono, 24 kHz, 16-bit) are assumed to match
            # the PCM returned by the TTS model - TODO confirm.
            temp_path = _write_temp_audio('tts_', '.wav', lambda path: wave_file(path, data))
        elif target_language in KOKORO_LANGUAGES:
            pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[target_language])
            generator = pipeline(translated_text, voice="af_heart", speed=1)
            audio_segments = [audio for _, _, audio in generator if audio is not None]
            if not audio_segments:
                raise ValueError("No audio generated by Kokoro")
            audio_data = np.concatenate(audio_segments)
            temp_path = _write_temp_audio('kokoro_', '.wav', lambda path: sf.write(path, audio_data, 24000))
        else:
            # Reverse lookup: display name -> gTTS code, defaulting to English.
            lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
            tts = gTTS(translated_text, lang=lang_code)
            temp_path = _write_temp_audio('gtts_', '.mp3', tts.save)

        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(temp_path)}'
        })
    except Exception as e:
        # Top-level request boundary: log with traceback and report the failure.
        app.logger.exception("Error processing request: %s", e)
        return jsonify({'error': str(e)}), 500
def download_file(filename):
    """Send a previously generated audio file from the temp directory.

    Args:
        filename: File name taken from the request; treated as untrusted.

    Returns:
        The file as an attachment named ``translated_<filename>``, or
        ``({'error': 'File not found'}, 404)`` when the name is not one of
        this app's generated files or the file does not exist.
    """
    # NOTE(review): no route decorator is visible on this view here;
    # presumably it is registered elsewhere - confirm.
    # The temp directory is shared with other programs, so only names carrying
    # the prefixes translate_audio uses for its outputs are served, and the
    # name is sanitised so it cannot point outside the directory.
    generated_prefixes = ('tts_', 'kokoro_', 'gtts_')
    safe_name = secure_filename(filename)
    if not safe_name.startswith(generated_prefixes):
        return jsonify({'error': 'File not found'}), 404

    # Gemini and Kokoro outputs are WAV; only the gTTS output is MP3.
    mimetype = "audio/wav" if safe_name.lower().endswith(".wav") else "audio/mpeg"
    try:
        return send_file(
            os.path.join(tempfile.gettempdir(), safe_name),
            mimetype=mimetype,
            as_attachment=True,
            download_name=f"translated_{safe_name}"
        )
    except (FileNotFoundError, IsADirectoryError):
        return jsonify({'error': 'File not found'}), 404
# Language display name -> BCP-47 code; built once instead of on every call.
_LANGUAGE_TO_BCP47 = {
    "Arabic": "ar-EG",
    "German": "de-DE",
    "English": "en-US",
    "Spanish": "es-US",
    "French": "fr-FR",
    "Hindi": "hi-IN",
    "Indonesian": "id-ID",
    "Italian": "it-IT",
    "Japanese": "ja-JP",
    "Korean": "ko-KR",
    "Portuguese": "pt-BR",
    "Russian": "ru-RU",
    "Dutch": "nl-NL",
    "Polish": "pl-PL",
    "Thai": "th-TH",
    "Turkish": "tr-TR",
    "Vietnamese": "vi-VN",
    "Romanian": "ro-RO",
    "Ukrainian": "uk-UA",
    "Bengali": "bn-BD",
    "Indian English": "en-IN",
    "Marathi": "mr-IN",
    "Tamil": "ta-IN",
    "Telugu": "te-IN",
}


def get_bcp47_code(language):
    """Return the BCP-47 code for a language display name.

    The match is exact and case-sensitive.

    Args:
        language: Language display name, e.g. ``"English"``.

    Returns:
        The matching code (e.g. ``"en-US"``), or ``None`` when the name is
        not in the table.
    """
    return _LANGUAGE_TO_BCP47.get(language)
if __name__ == '__main__':
    # Script entry point: start Flask's built-in server, listening on all
    # interfaces at port 7860.
    listen_options = {"host": "0.0.0.0", "port": 7860}
    app.run(**listen_options)