# Spaces: Running  (status banner captured along with the source; not part of the program)
import os
import tempfile
import uuid
import wave

import numpy as np
import soundfile as sf
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
from gtts import gTTS, lang
from kokoro import KPipeline
import google.generativeai as genai
from google.generativeai.types import (
    GenerateContentConfig,
    SpeechConfig,
    VoiceConfig,
    PrebuiltVoiceConfig,
)
# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# Setup:
#   1) Install the dependencies:
#        pip install --upgrade google-generativeai gTTS soundfile kokoro flask flask-cors werkzeug
#   2) Export the Gemini API key before starting the server:
#        export GEMINI_API_KEY="your_real_api_key_here"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    # Refuse to start without a key instead of failing on the first request.
    raise RuntimeError("GEMINI_API_KEY environment variable not set")

# Module-wide configuration, used by genai.GenerativeModel(...) calls.
genai.configure(api_key=GEMINI_API_KEY)
# Explicit client object, used for the TTS request.
# NOTE(review): `Client` (and the GenerateContentConfig / SpeechConfig types
# imported at the top of the file) appear to belong to the newer `google-genai`
# SDK (`from google import genai`), not `google.generativeai` -- confirm that
# these names resolve with the installed package.
client = genai.Client(api_key=GEMINI_API_KEY)
# Language tables for the two fallback TTS engines.
# Kokoro: display name -> single-letter pipeline language code.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# gTTS: language code -> display name, with Japanese added explicitly.
GTTS_LANGUAGES = {**lang.tts_langs(), "ja": "Japanese"}

# Every display name either engine can speak, de-duplicated and sorted.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

# Prebuilt voice requested from the Gemini TTS preview model.
GEMINI_VOICE_NAME = "Kore"
# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
def wave_file(filename: str, pcm: bytes, channels=1, rate=24000, sample_width=2):
    """Store raw PCM bytes as a WAV file at ``filename``.

    Args:
        filename: Destination path; opened for binary writing.
        pcm: Raw PCM frame data written as-is after the WAV header.
        channels: Number of audio channels recorded in the header.
        rate: Frame rate in Hz recorded in the header.
        sample_width: Bytes per sample recorded in the header.
    """
    # (nchannels, sampwidth, framerate, nframes, comptype, compname); the frame
    # count starts at 0 and is corrected by the wave module when frames are written.
    header = (channels, sample_width, rate, 0, "NONE", "not compressed")
    with wave.open(filename, "wb") as writer:
        writer.setparams(header)
        writer.writeframes(pcm)
# -----------------------------------------------------------------------------
# Flask App
# -----------------------------------------------------------------------------
# The application object; static assets are looked up in the "static" folder.
app = Flask(import_name=__name__, static_folder="static")
# Enable cross-origin requests using flask-cors defaults.
CORS(app)
# The view was never registered with the app, so it could not be reached.
# NOTE(review): the root path is assumed from the "serve your index.html"
# comment -- confirm against the deployed front end.
@app.route("/")
def serve_index():
    """Serve the front-end entry page.

    Returns:
        The response produced by ``send_from_directory`` for ``index.html``
        inside the app's static folder.
    """
    return send_from_directory(app.static_folder, "index.html")
# The view was never registered with the app, so it could not be reached.
# NOTE(review): the "/languages" path is an assumption -- nothing in this file
# names it; confirm against whatever the front end requests.
@app.route("/languages")
def list_languages():
    """Return the sorted list of supported language names as JSON."""
    return jsonify(SUPPORTED_LANGUAGES)
def _new_audio_path(engine, extension):
    """Return a unique temp-directory path for one synthesized audio file.

    A fixed name per engine would let concurrent requests overwrite each
    other's audio before the client downloads it, so a random token is
    embedded in the name. The ``tts_<engine>`` prefix is kept.
    """
    name = f"tts_{engine}_{uuid.uuid4().hex}{extension}"
    return os.path.join(tempfile.gettempdir(), name)


def _speak_with_gemini(text):
    """Synthesize ``text`` with the Gemini TTS preview model; return the WAV path."""
    tts_resp = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=text,
        config=GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(
                        voice_name=GEMINI_VOICE_NAME
                    )
                )
            ),
        ),
    )
    # The audio payload is read from the first part of the first candidate.
    pcm_data = tts_resp.candidates[0].content.parts[0].inline_data.data
    out_path = _new_audio_path("gemini", ".wav")
    # Written with wave_file's defaults (mono, 24000 Hz, 2-byte samples).
    wave_file(out_path, pcm_data)
    return out_path


def _speak_with_kokoro(text, lang_code):
    """Synthesize ``text`` with Kokoro for ``lang_code``; return the WAV path.

    Raises:
        RuntimeError: If the pipeline yields no audio segments.
    """
    pipeline = KPipeline(lang_code=lang_code)
    generator = pipeline(text, voice="af_heart", speed=1)
    # Each yielded item is a 3-tuple; only its last element (audio) is kept.
    segments = [audio for _, _, audio in generator if audio is not None]
    if not segments:
        raise RuntimeError("Kokoro produced no audio")
    out_path = _new_audio_path("kokoro", ".wav")
    sf.write(out_path, np.concatenate(segments), 24000)
    return out_path


def _speak_with_gtts(text, target_lang):
    """Synthesize ``text`` with gTTS; return the MP3 path.

    ``target_lang`` is a display name; it is mapped back to a gTTS language
    code, defaulting to ``"en"`` when no entry matches.
    """
    gtts_code = next(
        (code for code, name in GTTS_LANGUAGES.items() if name == target_lang),
        "en",
    )
    out_path = _new_audio_path("gtts", ".mp3")
    gTTS(text, lang=gtts_code).save(out_path)
    return out_path


# The view was never registered with the app. The path comes from this
# handler's own log message ("Error in /translate"); POST is required because
# it reads an uploaded file and form fields.
@app.route("/translate", methods=["POST"])
def translate_audio():
    """Transcribe an uploaded audio clip, translate it, and synthesize speech.

    Request (multipart form):
        audio: The uploaded audio file (required).
        language: Target language display name; defaults to ``"English"``.

    Returns:
        On success, JSON with ``transcription``, ``translation`` and
        ``audio_url`` (a ``/download/<name>`` link to the synthesized file).
        A 400 JSON error when the file is missing, empty-named, or has an
        unsupported MIME type. A 500 JSON error carrying ``str(e)`` for any
        other exception, which is also logged.
    """
    try:
        # 1. Receive file + target language
        if "audio" not in request.files:
            return jsonify(error="No audio file uploaded"), 400
        audio_file = request.files["audio"]
        target_lang = request.form.get("language", "English")
        if not audio_file or audio_file.filename == "":
            return jsonify(error="Invalid audio file"), 400

        # 2. Validate MIME type
        if audio_file.mimetype not in ("audio/wav", "audio/mpeg", "audio/mp4", "audio/webm"):
            return jsonify(error=f"Unsupported file type: {audio_file.mimetype}"), 400

        # 3. Transcribe with Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")
        blob = {"mime_type": audio_file.mimetype, "data": audio_file.read()}
        convo = model.start_chat()
        # NOTE(review): the instruction is sent as its own chat turn and its
        # reply is discarded; sending it together with the audio would
        # presumably save one round trip -- confirm before changing.
        convo.send_message(
            "You are a professional transcriber. Transcribe this audio accurately, verbatim."
        )
        resp = convo.send_message(blob)
        transcription = resp.text.strip()

        # 4. Translate with Gemini
        prompt = f"Translate the following text to {target_lang}, preserving meaning and cultural nuances:\n\n{transcription}"
        translation_resp = model.generate_content(prompt)
        translated_text = translation_resp.text.strip()

        # 5. Synthesize: Gemini TTS first, then Kokoro (for the languages it
        #    covers) or gTTS (for everything else).
        try:
            out_path = _speak_with_gemini(translated_text)
        except Exception:
            # Record why the primary engine failed instead of hiding it.
            app.logger.warning(
                "Gemini TTS failed; falling back to Kokoro/gTTS", exc_info=True
            )
            if target_lang in KOKORO_LANGUAGES:
                out_path = _speak_with_kokoro(
                    translated_text, KOKORO_LANGUAGES[target_lang]
                )
            else:
                out_path = _speak_with_gtts(translated_text, target_lang)

        return jsonify(
            transcription=transcription,
            translation=translated_text,
            audio_url=f"/download/{os.path.basename(out_path)}",
        )
    except Exception as e:
        app.logger.exception("Error in /translate")
        return jsonify(error=str(e)), 500
# The view was never registered with the app. The path matches the
# "/download/<name>" links that translate_audio returns as audio_url.
@app.route("/download/<filename>")
def download_file(filename):
    """Send a previously synthesized audio file as an attachment.

    Args:
        filename: Bare file name inside the system temp directory.

    Returns:
        The file as a download named ``translated_<filename>``, or a 404 JSON
        error when the name is not an acceptable, existing audio file.
    """
    # The temp directory is shared with other programs. Accept only bare names
    # (no directory components) carrying the "tts_" prefix used for every file
    # this app generates, so arbitrary temp files cannot be fetched.
    if os.path.basename(filename) != filename or not filename.startswith("tts_"):
        return jsonify(error="File not found"), 404
    path = os.path.join(tempfile.gettempdir(), filename)
    if not os.path.isfile(path):
        return jsonify(error="File not found"), 404
    return send_file(path, as_attachment=True, download_name=f"translated_{filename}")
if __name__ == "__main__":
    # Direct execution only: bind to all interfaces on port 7860.
    app.run(port=7860, host="0.0.0.0")