Spaces:

Athspi-ai
/

Audio-translation

Running

File size: 6,421 Bytes

5f33e0e
385365a
 
9ffbfd1
385365a
 
f49c906
385365a
 
 
f49c906
bfc5175
 
280b5d0
e51d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7cc4829
5ddb059
 
e51d62b
bfc5175
280b5d0
e51d62b
385365a
e51d62b
280b5d0
e51d62b
 
 
 
 
 
 
 
280b5d0
f49c906
e51d62b
 
 
 
 
bfc5175
e51d62b
 
385365a
e51d62b
 
 
 
 
 
385365a
 
 
 
 
6ebed08
e51d62b
 
 
 
 
 
 
 
 
dbed07a
e51d62b
 
63a0fca
e51d62b
 
 
dbed07a
965bd2d
e51d62b
 
5ddb059
6c131f6
e51d62b
 
 
385365a
e51d62b
 
385365a
e51d62b
 
5ddb059
e51d62b
 
 
5ddb059
e51d62b
 
 
f49c906
9ffbfd1
e51d62b
 
 
 
 
dbed07a
e51d62b
 
 
 
385365a
e51d62b
385365a
e51d62b
 
385365a
e51d62b
 
 
 
 
 
 
 
 
280b5d0
385365a
e51d62b
 
 
 
 
 
 
 
 
385365a
 
e51d62b
 
 
 
 
385365a
e51d62b
385365a
e51d62b
 
 
 
 
 
 
 
 
 
 
 
385365a
5ddb059
e51d62b
 
 
dbe8a71
e51d62b
7cc4829
e51d62b
 
 
 
 
dbe8a71
e51d62b
9ffbfd1

import os
import tempfile
import wave
import numpy as np
import soundfile as sf

from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename

from gtts import gTTS, lang
from kokoro import KPipeline

import google.generativeai as genai
from google.generativeai.types import (
    GenerateContentConfig,
    SpeechConfig,
    VoiceConfig,
    PrebuiltVoiceConfig,
)

# -----------------------------------------------------------------------------
#  Configuration
# -----------------------------------------------------------------------------

# 1) Make sure you've run:
#      pip install --upgrade google-generativeai gTTS soundfile kokoro flask flask-cors werkzeug
#
# 2) Set your Gemini API key in the environment:
#      export GEMINI_API_KEY="your_real_api_key_here"

# Fail fast at import time when the Gemini key is missing or empty.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise RuntimeError("GEMINI_API_KEY environment variable not set")

# The module-level SDK configuration and the explicit client share one key.
# NOTE(review): `genai.Client` looks like the newer google-genai SDK surface;
# confirm it exists in the installed google-generativeai package.
genai.configure(api_key=GEMINI_API_KEY)
client = genai.Client(api_key=GEMINI_API_KEY)

# Display name -> Kokoro language code.
KOKORO_LANGUAGES = dict(
    [
        ("American English", "a"),
        ("British English", "b"),
        ("Mandarin Chinese", "z"),
        ("Spanish", "e"),
        ("French", "f"),
        ("Hindi", "h"),
        ("Italian", "i"),
        ("Brazilian Portuguese", "p"),
    ]
)

# gTTS language code -> display name, with Japanese registered explicitly.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES.update(ja="Japanese")

# Alphabetical, de-duplicated list of every display name either engine knows.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

# Prebuilt voice requested from the Gemini TTS preview model.
GEMINI_VOICE_NAME = "Kore"

# -----------------------------------------------------------------------------
#  Helpers
# -----------------------------------------------------------------------------

def wave_file(filename: str, pcm: bytes, channels=1, rate=24000, sample_width=2):
    """Store raw PCM frames in ``filename`` as an uncompressed WAV file.

    Args:
        filename: Destination path; an existing file is overwritten.
        pcm: Raw PCM sample bytes to write.
        channels: Number of audio channels.
        rate: Sampling rate in frames per second.
        sample_width: Size of one sample in bytes.
    """
    # (nchannels, sampwidth, framerate, nframes, comptype, compname); the frame
    # count is patched into the header by the wave module after writing.
    header = (channels, sample_width, rate, 0, "NONE", "not compressed")
    with wave.open(filename, "wb") as out:
        out.setparams(header)
        out.writeframes(pcm)

# -----------------------------------------------------------------------------
#  Flask App
# -----------------------------------------------------------------------------

# The Flask application; static assets (including index.html) live in the
# "static" folder.
app = Flask(__name__, static_folder="static")
# Enable cross-origin requests for every route, using the flask-cors defaults
# (no options are passed here).
CORS(app)


@app.route("/")
def serve_index():
    """Serve the front-end page, ``index.html``, from the app's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, "index.html")


@app.route("/languages")
def list_languages():
    """Return the sorted list of supported language names as a JSON array."""
    payload = jsonify(SUPPORTED_LANGUAGES)
    return payload


def _write_output(prefix: str, suffix: str, writer) -> str:
    """Create a uniquely named file in the temp directory and fill it.

    A unique name per request keeps concurrent requests from overwriting each
    other's audio. ``writer`` is called with the reserved path; if it raises,
    the reserved file is removed again and the exception propagates.

    Returns:
        The path of the written file.
    """
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    os.close(fd)  # only the reserved name is needed; writers reopen the path
    try:
        writer(path)
    except Exception:
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup; the writer's error is the one to surface
        raise
    return path


def _synthesize_with_gemini(text: str) -> str:
    """Synthesize ``text`` with the Gemini TTS preview model into a WAV file.

    Returns:
        The path of the generated ``tts_gemini_*.wav`` file.

    Raises:
        Exception: Whatever the SDK call, response access, or file write raises.
    """
    # NOTE(review): `client.models.generate_content` and these config classes
    # look like the google-genai SDK surface — confirm against the installed SDK.
    tts_resp = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=text,
        config=GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(
                        voice_name=GEMINI_VOICE_NAME
                    )
                )
            ),
        ),
    )
    pcm_data = tts_resp.candidates[0].content.parts[0].inline_data.data
    return _write_output(
        "tts_gemini_", ".wav", lambda path: wave_file(path, pcm_data)
    )


def _synthesize_with_fallback(text: str, target_lang: str) -> str:
    """Synthesize ``text`` with Kokoro when it knows ``target_lang``, else gTTS.

    Returns:
        The path of the generated ``tts_kokoro_*.wav`` or ``tts_gtts_*.mp3`` file.

    Raises:
        RuntimeError: If Kokoro handles the language but yields no audio.
    """
    if target_lang in KOKORO_LANGUAGES:
        pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[target_lang])
        generator = pipeline(text, voice="af_heart", speed=1)
        segments = [audio for _, _, audio in generator if audio is not None]
        if not segments:
            raise RuntimeError("Kokoro produced no audio")
        samples = np.concatenate(segments)
        return _write_output(
            "tts_kokoro_", ".wav", lambda path: sf.write(path, samples, 24000)
        )

    # gTTS maps code -> display name, so look the code up by name; unknown
    # names fall back to English.
    gtts_code = next(
        (k for k, v in GTTS_LANGUAGES.items() if v == target_lang), "en"
    )
    tts = gTTS(text, lang=gtts_code)
    return _write_output("tts_gtts_", ".mp3", tts.save)


@app.route("/translate", methods=["POST"])
def translate_audio():
    """Transcribe an uploaded audio file, translate it, and synthesize speech.

    Expects a multipart POST with an ``audio`` file and an optional
    ``language`` form field (defaults to ``"English"``).

    Returns:
        On success, JSON with ``transcription``, ``translation`` and
        ``audio_url`` (a ``/download/<name>`` link to the synthesized audio).
        A JSON ``error`` with status 400 for a missing, empty, or unsupported
        upload, and with status 500 for any other failure.
    """
    try:
        # 1. Receive file + target language
        if "audio" not in request.files:
            return jsonify(error="No audio file uploaded"), 400

        audio_file = request.files["audio"]
        target_lang = request.form.get("language", "English")

        if not audio_file or audio_file.filename == "":
            return jsonify(error="Invalid audio file"), 400

        # 2. Validate MIME type
        if audio_file.mimetype not in ("audio/wav", "audio/mpeg", "audio/mp4", "audio/webm"):
            return jsonify(error=f"Unsupported file type: {audio_file.mimetype}"), 400

        # 3. Transcribe with Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")
        blob = {"mime_type": audio_file.mimetype, "data": audio_file.read()}

        convo = model.start_chat()
        convo.send_message(
            "You are a professional transcriber. Transcribe this audio accurately, verbatim."
        )
        resp = convo.send_message(blob)
        transcription = resp.text.strip()

        # 4. Translate with Gemini
        prompt = f"Translate the following text to {target_lang}, preserving meaning and cultural nuances:\n\n{transcription}"
        translation_resp = model.generate_content(prompt)
        translated_text = translation_resp.text.strip()

        # 5. Synthesize speech: Gemini TTS first, Kokoro/gTTS if that fails
        try:
            out_path = _synthesize_with_gemini(translated_text)
        except Exception:
            # Record why the primary engine failed instead of hiding it.
            app.logger.warning(
                "Gemini TTS failed; falling back to Kokoro/gTTS", exc_info=True
            )
            out_path = _synthesize_with_fallback(translated_text, target_lang)

        return jsonify(
            transcription=transcription,
            translation=translated_text,
            audio_url=f"/download/{os.path.basename(out_path)}",
        )

    except Exception as e:
        app.logger.exception("Error in /translate")
        return jsonify(error=str(e)), 500


@app.route("/download/<filename>")
def download_file(filename):
    """Send a synthesized audio file from the temp directory as an attachment.

    Only names that are already sanitized, start with ``tts_`` and end in
    ``.wav`` or ``.mp3`` are served, so unrelated files that happen to live in
    the shared temp directory cannot be downloaded through this route.

    Args:
        filename: Basename of the audio file, as given in ``audio_url``.

    Returns:
        The file as an attachment named ``translated_<filename>``, or a JSON
        ``error`` with status 404 when the name is rejected or no such file
        exists.
    """
    safe_name = secure_filename(filename)
    is_tts_output = (
        safe_name == filename  # reject anything sanitizing would have altered
        and safe_name.startswith("tts_")
        and safe_name.endswith((".wav", ".mp3"))
    )
    if not is_tts_output:
        return jsonify(error="File not found"), 404

    path = os.path.join(tempfile.gettempdir(), safe_name)
    if not os.path.isfile(path):
        return jsonify(error="File not found"), 404
    return send_file(path, as_attachment=True, download_name=f"translated_{safe_name}")


# Script entry point: start Flask's built-in server on all network interfaces,
# port 7860.
# NOTE(review): 7860 is presumably the port the hosting platform expects — confirm.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)