# NOTE: the next six lines are repository-page residue, not program code.
# Athspi's picture
# Update app.py
# e51d62b verified
# raw
# history blame
# 6.42 kB
import os
import tempfile
import wave
import numpy as np
import soundfile as sf
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
from gtts import gTTS, lang
from kokoro import KPipeline
import google.generativeai as genai
from google.generativeai.types import (
GenerateContentConfig,
SpeechConfig,
VoiceConfig,
PrebuiltVoiceConfig,
)
# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# 1) Make sure you've run:
# pip install --upgrade google-generativeai gTTS soundfile kokoro flask flask-cors werkzeug
#
# 2) Set your Gemini API key in the environment:
# export GEMINI_API_KEY="your_real_api_key_here"
# Read the Gemini credential once at import time and refuse to start without
# it, so a misconfigured deployment fails immediately instead of per request.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY in (None, ""):
    raise RuntimeError("GEMINI_API_KEY environment variable not set")

# Module-wide configuration used by genai.GenerativeModel(...).
genai.configure(api_key=GEMINI_API_KEY)

# Explicit client object used for the TTS call in /translate.
# NOTE(review): `Client` appears to belong to the newer google-genai SDK
# ("from google import genai"); confirm that the installed
# google.generativeai package actually exposes it.
client = genai.Client(api_key=GEMINI_API_KEY)
# Language display name -> Kokoro pipeline language code.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# gTTS language code -> display name, with Japanese guaranteed to be present.
GTTS_LANGUAGES = {**lang.tts_langs(), "ja": "Japanese"}

# Alphabetical, de-duplicated list of every display name either engine knows;
# this is what the /languages endpoint returns.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

# Prebuilt voice requested from the Gemini TTS preview model.
GEMINI_VOICE_NAME = "Kore"
# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
def wave_file(filename: str, pcm: bytes, channels=1, rate=24000, sample_width=2):
    """Store raw PCM samples as a WAV file.

    Args:
        filename: Destination path; an existing file is overwritten.
        pcm: Raw PCM frame bytes to place in the data chunk.
        channels: Number of audio channels.
        rate: Sampling rate in frames per second.
        sample_width: Size of one sample in bytes.
    """
    writer = wave.open(filename, "wb")
    try:
        # One call sets channels, sample width and frame rate; the frame count
        # is left at 0 so the header is derived from the data actually written.
        writer.setparams(
            (channels, sample_width, rate, 0, "NONE", "not compressed")
        )
        writer.writeframes(pcm)
    finally:
        writer.close()
# -----------------------------------------------------------------------------
# Flask App
# -----------------------------------------------------------------------------
# The Flask application object; its static folder is the "static" directory,
# which is where the "/" route looks for index.html.
app = Flask(__name__, static_folder="static")
# Wrap the app with flask-cors using its default settings so that browser
# front-ends served from another origin can call these endpoints.
CORS(app)
@app.route("/")
def serve_index():
    """Return the front-end entry page, ``index.html``, from the static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, "index.html")
@app.route("/languages")
def list_languages():
    """Return the supported target-language names as a JSON array."""
    payload = jsonify(SUPPORTED_LANGUAGES)
    return payload
# MIME types accepted for the uploaded recording.
_ALLOWED_AUDIO_MIMETYPES = ("audio/wav", "audio/mpeg", "audio/mp4", "audio/webm")
# File-name prefixes of the audio this module generates in the temp directory.
_OUTPUT_PREFIXES = ("tts_gemini_", "tts_kokoro_", "tts_gtts_")
# Generated audio older than this many seconds is removed on a later request.
_OUTPUT_TTL_SECONDS = 3600


def _new_output_path(engine: str, suffix: str) -> str:
    """Reserve a unique ``tts_<engine>_*`` file in the temp dir and return its path."""
    # mkstemp creates the file atomically, so two concurrent requests can never
    # be handed the same name (the original fixed names were overwritten).
    fd, path = tempfile.mkstemp(prefix=f"tts_{engine}_", suffix=suffix)
    os.close(fd)
    return path


def _prune_stale_outputs(max_age: float = _OUTPUT_TTL_SECONDS) -> None:
    """Best-effort removal of generated audio older than *max_age* seconds."""
    import time  # local import keeps this block self-contained

    tmp_dir = tempfile.gettempdir()
    cutoff = time.time() - max_age
    try:
        names = os.listdir(tmp_dir)
    except OSError:
        return
    for name in names:
        if not name.startswith(_OUTPUT_PREFIXES):
            continue
        path = os.path.join(tmp_dir, name)
        try:
            if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
                os.remove(path)
        except OSError:
            # Another worker may already have removed it; cleanup is optional.
            continue


def _synthesize_gemini(text: str) -> str:
    """Synthesize *text* with the Gemini TTS preview model; return the WAV path."""
    tts_resp = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=text,
        config=GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(
                        voice_name=GEMINI_VOICE_NAME
                    )
                )
            ),
        ),
    )
    pcm_data = tts_resp.candidates[0].content.parts[0].inline_data.data
    # Reserve the file only after the API call succeeded, so a failed call
    # leaves nothing behind.
    out_path = _new_output_path("gemini", ".wav")
    wave_file(out_path, pcm_data)
    return out_path


def _synthesize_kokoro(text: str, lang_code: str) -> str:
    """Synthesize *text* with Kokoro; return the WAV path.

    Raises:
        RuntimeError: If the pipeline yields no audio segments.
    """
    pipeline = KPipeline(lang_code=lang_code)
    segments = [
        audio
        for _, _, audio in pipeline(text, voice="af_heart", speed=1)
        if audio is not None
    ]
    if not segments:
        raise RuntimeError("Kokoro produced no audio")
    out_path = _new_output_path("kokoro", ".wav")
    sf.write(out_path, np.concatenate(segments), 24000)
    return out_path


def _synthesize_gtts(text: str, target_lang: str) -> str:
    """Synthesize *text* with gTTS; return the MP3 path."""
    # Map the display name back to a gTTS code; unknown names fall back to "en".
    gtts_code = next(
        (code for code, name in GTTS_LANGUAGES.items() if name == target_lang),
        "en",
    )
    out_path = _new_output_path("gtts", ".mp3")
    gTTS(text, lang=gtts_code).save(out_path)
    return out_path


def _synthesize_speech(text: str, target_lang: str) -> str:
    """Try Gemini TTS first, then Kokoro or gTTS; return the audio file path."""
    try:
        return _synthesize_gemini(text)
    except Exception:
        # Any Gemini failure triggers the fallback, but it is logged so the
        # cause is not lost.
        app.logger.warning(
            "Gemini TTS failed; falling back to local TTS", exc_info=True
        )
    if target_lang in KOKORO_LANGUAGES:
        return _synthesize_kokoro(text, KOKORO_LANGUAGES[target_lang])
    return _synthesize_gtts(text, target_lang)


@app.route("/translate", methods=["POST"])
def translate_audio():
    """Transcribe an uploaded recording, translate it, and synthesize speech.

    Expects a multipart POST with an ``audio`` file and an optional
    ``language`` form field (default ``"English"``).

    Returns:
        JSON with ``transcription``, ``translation`` and ``audio_url`` (a
        ``/download/<name>`` path) on success; JSON ``error`` with status 400
        for a missing, empty or unsupported upload; JSON ``error`` with status
        500 for any failure while processing.
    """
    try:
        # 1. Receive file + target language
        if "audio" not in request.files:
            return jsonify(error="No audio file uploaded"), 400
        audio_file = request.files["audio"]
        target_lang = request.form.get("language", "English")
        if not audio_file or audio_file.filename == "":
            return jsonify(error="Invalid audio file"), 400

        # 2. Validate MIME type
        if audio_file.mimetype not in _ALLOWED_AUDIO_MIMETYPES:
            return jsonify(error=f"Unsupported file type: {audio_file.mimetype}"), 400

        # 3. Transcribe with Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")
        blob = {"mime_type": audio_file.mimetype, "data": audio_file.read()}
        convo = model.start_chat()
        convo.send_message(
            "You are a professional transcriber. Transcribe this audio accurately, verbatim."
        )
        resp = convo.send_message(blob)
        transcription = resp.text.strip()

        # 4. Translate with Gemini
        prompt = f"Translate the following text to {target_lang}, preserving meaning and cultural nuances:\n\n{transcription}"
        translation_resp = model.generate_content(prompt)
        translated_text = translation_resp.text.strip()

        # 5. Synthesize speech into a per-request file. Old outputs are pruned
        #    first so that unique file names do not accumulate forever.
        _prune_stale_outputs()
        out_path = _synthesize_speech(translated_text, target_lang)

        return jsonify(
            transcription=transcription,
            translation=translated_text,
            audio_url=f"/download/{os.path.basename(out_path)}",
        )
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as JSON.
        app.logger.exception("Error in /translate")
        return jsonify(error=str(e)), 500
@app.route("/download/<filename>")
def download_file(filename):
    """Send a previously generated audio file as an attachment.

    Only sanitised names that start with ``tts_`` are served. Without this
    check the endpoint would hand out any regular file that happens to live
    in the system temp directory.

    Args:
        filename: Base name taken from the URL, as issued in ``audio_url``
            by the ``/translate`` endpoint.

    Returns:
        The file as an attachment named ``translated_<filename>``, or JSON
        ``error`` with status 404 when the name is rejected or no such file
        exists.
    """
    safe_name = secure_filename(filename)
    # Reject anything secure_filename had to alter (path separators, "..",
    # odd characters) and anything this app did not generate.
    if safe_name != filename or not safe_name.startswith("tts_"):
        return jsonify(error="File not found"), 404
    path = os.path.join(tempfile.gettempdir(), safe_name)
    if not os.path.isfile(path):
        return jsonify(error="File not found"), 404
    return send_file(path, as_attachment=True, download_name=f"translated_{safe_name}")
if __name__ == "__main__":
    # Direct execution only: listen on every interface, port 7860.
    app.run(port=7860, host="0.0.0.0")