Spaces:

Athspi-ai
/

Audio-translation

Running

File size: 6,635 Bytes

5f33e0e
9ffbfd1
c07d698
9ffbfd1
bfc5175
 
 
7cc4829
bfc5175
 
 
 
dbe8a71
bfc5175
 
 
 
6ebed08
5ddb059
7cc4829
bfc5175
5ddb059
 
 
 
bfc5175
dbe8a71
bfc5175
9ffbfd1
bfc5175
 
6ebed08
9ffbfd1
bfc5175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317b2f2
6ebed08
dbed07a
 
 
63a0fca
bfc5175
dbed07a
 
 
965bd2d
bfc5175
5ddb059
 
6c131f6
5ddb059
 
bfc5175
5ddb059
9ffbfd1
bfc5175
5ddb059
 
 
9ffbfd1
dbed07a
9ffbfd1
5ddb059
9ffbfd1
bfc5175
 
9ffbfd1
 
 
dbed07a
 
9ffbfd1
 
6ebed08
bfc5175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ffbfd1
 
 
bfc5175
9ffbfd1
 
 
 
bfc5175
9ffbfd1
 
bfc5175
 
9ffbfd1
 
 
 
 
bfc5175
 
 
5ddb059
7cc4829
 
bfc5175
5ddb059
bfc5175
5ddb059
9ffbfd1
5ddb059
dbe8a71
bfc5175
7cc4829
 
 
 
9ffbfd1
 
7cc4829
9ffbfd1
ef2c8e0
dbed07a
 
dbe8a71
bfc5175
 
 
 
 
 
 
 
 
 
 
 
7cc4829
9ffbfd1

import os
import tempfile
import wave

import numpy as np
import soundfile as sf
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from google import genai as google_genai
from google import generativeai as genai
from google.genai import types
from gtts import gTTS, lang
from kokoro import KPipeline
from werkzeug.utils import secure_filename

# Flask app setup
app = Flask(__name__, static_folder='static')
CORS(app)

# Gemini API configuration
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")
# Legacy SDK (google.generativeai): backs the genai.GenerativeModel calls used
# for transcription and translation.
genai.configure(api_key=GEMINI_API_KEY)
# New SDK (google.genai): provides Client, whose models.generate_content call is
# used together with google.genai.types for text-to-speech. The legacy module
# bound to `genai` has no Client attribute, so the client must come from here.
client = google_genai.Client(api_key=GEMINI_API_KEY)

# Language support
# Kokoro: display name -> pipeline language code.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}
# gTTS: language code -> display name.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES['ja'] = 'Japanese'
# Every display name offered by either engine, de-duplicated and alphabetised.
SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values()))

# BCP-47 locale -> Gemini prebuilt voice; every locale currently uses "Kore".
GEMINI_VOICES = dict.fromkeys(
    (
        "ar-EG", "de-DE", "en-US", "es-US", "fr-FR",
        "hi-IN", "id-ID", "it-IT", "ja-JP", "ko-KR",
        "pt-BR", "ru-RU", "nl-NL", "pl-PL", "th-TH",
        "tr-TR", "vi-VN", "ro-RO", "uk-UA", "bn-BD",
        "en-IN", "mr-IN", "ta-IN", "te-IN",
    ),
    "Kore",
)


def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Destination path (or writable file object) for the WAV data.
        pcm: Raw audio frames as bytes.
        channels: Number of audio channels.
        rate: Sampling rate in Hz.
        sample_width: Bytes per sample.
    """
    with wave.open(filename, "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # the frame count is patched into the header once the frames are written.
        writer.setparams((channels, sample_width, rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm)


@app.route('/')
def serve_index():
    """Serve ``index.html`` from the app's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, 'index.html')


@app.route('/languages')
def get_languages():
    """Return the SUPPORTED_LANGUAGES list as a JSON response."""
    payload = jsonify(SUPPORTED_LANGUAGES)
    return payload


def _reserve_output_path(prefix, suffix):
    """Create an empty, uniquely named file in the temp directory and return its path.

    A unique name keeps concurrent requests (or uploads that share a filename)
    from overwriting each other's generated audio.
    """
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix, dir=tempfile.gettempdir())
    os.close(fd)  # the TTS writers below reopen the file by path
    return path


def _synthesize_speech(text, target_language):
    """Render ``text`` as speech and return the path of the generated audio file.

    Engine selection, in priority order:
      1. Gemini TTS when the language maps to a voice in GEMINI_VOICES (WAV).
      2. Kokoro when the language is listed in KOKORO_LANGUAGES (WAV).
      3. gTTS otherwise, falling back to the 'en' voice for unknown names (MP3).

    Raises:
        ValueError: If the selected engine returns no audio.
    """
    voice_name = GEMINI_VOICES.get(get_bcp47_code(target_language))

    if voice_name:
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=text,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
                    )
                )
            )
        )
        # A blocked or empty response has no candidates/parts/inline data;
        # report that explicitly instead of failing on an attribute lookup.
        try:
            pcm = response.candidates[0].content.parts[0].inline_data.data
        except (AttributeError, IndexError, TypeError):
            pcm = None
        if not pcm:
            raise ValueError("No audio generated by Gemini TTS")
        output_path = _reserve_output_path('tts_', '.wav')
        wave_file(output_path, pcm)
        return output_path

    if target_language in KOKORO_LANGUAGES:
        pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[target_language])
        generator = pipeline(text, voice="af_heart", speed=1)
        audio_segments = [audio for _, _, audio in generator if audio is not None]
        if not audio_segments:
            raise ValueError("No audio generated by Kokoro")
        output_path = _reserve_output_path('kokoro_', '.wav')
        sf.write(output_path, np.concatenate(audio_segments), 24000)
        return output_path

    # gTTS maps code -> display name, so search by value; default to English.
    lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
    tts = gTTS(text, lang=lang_code)
    output_path = _reserve_output_path('gtts_', '.mp3')
    try:
        tts.save(output_path)
    except Exception:
        # Do not leave an empty placeholder behind when synthesis fails.
        os.remove(output_path)
        raise
    return output_path


@app.route('/translate', methods=['POST'])
def translate_audio():
    """Transcribe an uploaded audio clip, translate it, and synthesize the translation.

    Expects multipart form data with:
        audio: The audio file (WAV, MPEG, MP4 or WebM).
        language: Target language display name; defaults to 'English'.

    Returns:
        200 with JSON ``transcription``, ``translation`` and ``audio_url``;
        400 for a missing, empty or unsupported upload;
        422 when nothing could be transcribed;
        500 with an ``error`` message for any other failure.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400

        audio_file = request.files['audio']
        target_language = request.form.get('language') or 'English'

        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400

        allowed_mime_types = ('audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm')
        if audio_file.mimetype not in allowed_mime_types:
            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400

        audio_bytes = audio_file.read()
        if not audio_bytes:
            return jsonify({'error': 'Invalid audio file'}), 400

        model = genai.GenerativeModel("gemini-2.0-flash")

        # Send the instruction and the audio in one request, so no round-trip
        # is spent on the instruction alone.
        response = model.generate_content([
            "You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.",
            {'mime_type': audio_file.mimetype, 'data': audio_bytes},
        ])
        transcription = response.text.strip()
        if not transcription:
            return jsonify({'error': 'No speech could be transcribed from the audio'}), 422

        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
        response = model.generate_content(prompt)
        translated_text = response.text.strip()

        temp_path = _synthesize_speech(translated_text, target_language)

        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(temp_path)}'
        })

    except Exception as e:
        # Top-level boundary for the request: log with traceback, reply with JSON.
        app.logger.exception("Error processing request")
        return jsonify({'error': str(e)}), 500


@app.route('/download/<filename>')
def download_file(filename):
    """Send a generated speech file from the temp directory as an attachment.

    Args:
        filename: Base name of a file produced by the /translate route
            (prefixed ``tts_``, ``kokoro_`` or ``gtts_``).

    Returns:
        The audio file, or a JSON 404 when the name is not one of this app's
        generated files or the file no longer exists.
    """
    # The system temp directory is shared with other programs, so only serve
    # names that match what /translate writes and that are already sanitised.
    generated_prefixes = ('tts_', 'kokoro_', 'gtts_')
    if secure_filename(filename) != filename or not filename.startswith(generated_prefixes):
        return jsonify({'error': 'File not found'}), 404

    # Gemini and Kokoro output is WAV; only the gTTS output is MP3.
    mimetype = "audio/wav" if filename.lower().endswith(".wav") else "audio/mpeg"

    try:
        return send_file(
            os.path.join(tempfile.gettempdir(), filename),
            mimetype=mimetype,
            as_attachment=True,
            download_name=f"translated_{filename}"
        )
    except FileNotFoundError:
        return jsonify({'error': 'File not found'}), 404


def get_bcp47_code(language):
    """Look up the BCP-47 locale code for a language display name.

    Args:
        language: Display name such as "English" (exact, case-sensitive match).

    Returns:
        The locale code (e.g. "en-US"), or None when the name is not listed.
    """
    locale_by_name = {
        "Arabic": "ar-EG",
        "German": "de-DE",
        "English": "en-US",
        "Spanish": "es-US",
        "French": "fr-FR",
        "Hindi": "hi-IN",
        "Indonesian": "id-ID",
        "Italian": "it-IT",
        "Japanese": "ja-JP",
        "Korean": "ko-KR",
        "Portuguese": "pt-BR",
        "Russian": "ru-RU",
        "Dutch": "nl-NL",
        "Polish": "pl-PL",
        "Thai": "th-TH",
        "Turkish": "tr-TR",
        "Vietnamese": "vi-VN",
        "Romanian": "ro-RO",
        "Ukrainian": "uk-UA",
        "Bengali": "bn-BD",
        "Indian English": "en-IN",
        "Marathi": "mr-IN",
        "Tamil": "ta-IN",
        "Telugu": "te-IN",
    }
    try:
        return locale_by_name[language]
    except KeyError:
        return None


if __name__ == '__main__':
    # Script entry point: start Flask's built-in server, listening on every
    # network interface at port 7860.
    bind_host = "0.0.0.0"
    bind_port = 7860
    app.run(host=bind_host, port=bind_port)