Spaces:

Athspi-ai
/

Audio-translation

Running

File size: 4,787 Bytes

6f05665
7582b7f
d0dd39c
ab0df5d
dbe8a71
c07d698
6ebed08
 
7cc4829
 
dbe8a71
6ebed08
7cc4829
 
 
413a70d
 
7cc4829
ab0df5d
dbe8a71
7cc4829
6ebed08
 
 
 
 
 
 
 
 
 
 
 
7cc4829
317b2f2
 
514903a
7cc4829
 
 
 
 
 
 
 
dbe8a71
7cc4829
 
dbe8a71
7cc4829
 
 
 
 
 
 
 
 
965bd2d
 
 
 
6ebed08
 
 
11a3089
965bd2d
 
 
 
 
 
 
 
 
 
c07d698
6ebed08
 
965bd2d
6ebed08
 
7cc4829
7582b7f
6ebed08
 
 
 
7582b7f
 
 
 
 
 
 
 
 
 
6ebed08
 
7582b7f
 
6ebed08
7582b7f
6ebed08
 
 
 
7cc4829
 
 
 
 
 
 
dbe8a71
6ebed08
7cc4829
dbe8a71
7cc4829
 
 
 
 
 
 
 
ef2c8e0
7cc4829
 
dbe8a71
7cc4829
1c39787

import os
import numpy as np
from flask import Flask, request, jsonify, send_file, send_from_directory
import google.generativeai as genai
from gtts import gTTS, lang
import tempfile
import soundfile as sf
from kokoro import KPipeline
from werkzeug.utils import secure_filename
from flask_cors import CORS

# Flask application serving assets from ./static; flask_cors wraps it with
# its default options.
app = Flask(__name__, static_folder="static")
CORS(app)

# Gemini credentials are read from the environment; the module refuses to
# load when the key is missing or empty.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise ValueError("GEMINI_API_KEY environment variable not set")
genai.configure(api_key=GEMINI_API_KEY)

# Display name -> language code handed to Kokoro's KPipeline.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Japanese": "j",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# Mapping returned by gTTS; its values are treated as display names here.
GTTS_LANGUAGES = lang.tts_langs()

# Alphabetical union of both name sets; names present in both appear once.
SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values()))

@app.route("/")
def serve_index():
    """Serve ``index.html`` from the application's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, "index.html")

@app.route("/languages")
def get_languages():
    """Return the list of supported language names as a JSON response."""
    payload = jsonify(SUPPORTED_LANGUAGES)
    return payload

def _reserve_temp_audio_path(suffix):
    """Create an empty temporary file ending in *suffix* and return its path.

    ``tempfile.mkstemp`` hands back an already-open OS-level file descriptor
    together with the path. The descriptor is closed here so that each
    request does not leak one; the (empty) file stays on disk for the caller
    to write into by path.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    return path


@app.route('/translate', methods=['POST'])
def translate_audio():
    """Transcribe an uploaded audio file, translate it, and synthesise speech.

    Expects a multipart POST with:
        audio:    the audio file (WAV, MPEG, MP4 or WebM MIME type).
        language: target language name (form field, defaults to 'English').

    Returns:
        On success, JSON with ``transcription``, ``translation`` and
        ``audio_url`` (a ``/download/<name>`` path to the generated file).
        On a missing/invalid upload or unsupported MIME type, JSON with an
        ``error`` message and status 400.
        On any other failure, JSON with an ``error`` message and status 500.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400

        audio_file = request.files['audio']
        target_language = request.form.get('language', 'English')

        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400

        # Validate MIME type
        allowed_mime_types = ('audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm')
        if audio_file.mimetype not in allowed_mime_types:
            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400

        # Transcribe audio using Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")

        # Inline audio payload: MIME type plus the raw uploaded bytes.
        audio_blob = {
            'mime_type': audio_file.mimetype,
            'data': audio_file.read()
        }

        # The instruction is sent first, then the audio in the same chat, so
        # the reply to the second message is taken as the transcription.
        convo = model.start_chat()
        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
        response = convo.send_message(audio_blob)
        transcription = response.text.strip()

        # Translate text using Gemini
        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
        response = model.generate_content(prompt)
        translated_text = response.text.strip()

        # Generate TTS: Kokoro when the language is in its table, gTTS otherwise.
        if target_language in KOKORO_LANGUAGES:
            lang_code = KOKORO_LANGUAGES[target_language]
            pipeline = KPipeline(lang_code=lang_code)
            generator = pipeline(translated_text, voice="af_heart", speed=1)

            # Each yielded item unpacks to three values; only the third
            # (the audio chunk) is kept, skipping chunks that are None.
            audio_segments = [audio for _, _, audio in generator if audio is not None]

            if not audio_segments:
                raise ValueError("No audio generated by Kokoro")

            audio_data = np.concatenate(audio_segments)
            temp_output_path = _reserve_temp_audio_path(".wav")
            # 24000 is passed to soundfile as the sample rate.
            sf.write(temp_output_path, audio_data, 24000)
        else:
            # Fallback to gTTS; names gTTS does not know fall back to 'en'.
            lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
            tts = gTTS(translated_text, lang=lang_code)
            temp_output_path = _reserve_temp_audio_path(".mp3")
            tts.save(temp_output_path)

        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(temp_output_path)}'
        })

    except Exception as e:
        # Top-level request boundary: log with traceback, report as HTTP 500.
        app.logger.exception("Error processing request: %s", e)
        return jsonify({'error': str(e)}), 500

# Extensions this app writes into the temp directory, with the MIME type
# each one should be served as.
_DOWNLOAD_MIME_TYPES = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
}


@app.route('/download/<filename>')
def download_file(filename):
    """Send a previously generated audio file from the temp directory.

    Args:
        filename: Base name of the file, as returned in ``audio_url`` by the
            translate endpoint.

    Returns:
        The file as an attachment named ``translated_<filename>``, served
        with a MIME type matching its extension, or a JSON ``error`` with
        status 404 when the name is not a plain ``.wav``/``.mp3`` file name
        or the file does not exist.
    """
    # The name comes straight from the URL, so only accept it when it is
    # already a sanitised base name with one of the extensions this app
    # produces. This keeps other files in the shared temp directory out of
    # reach.
    safe_name = secure_filename(filename)
    extension = os.path.splitext(safe_name)[1].lower()
    mimetype = _DOWNLOAD_MIME_TYPES.get(extension)
    if safe_name != filename or mimetype is None:
        return jsonify({'error': 'File not found'}), 404

    try:
        return send_file(
            os.path.join(tempfile.gettempdir(), safe_name),
            mimetype=mimetype,
            as_attachment=True,
            download_name=f"translated_{safe_name}"
        )
    except FileNotFoundError:
        return jsonify({'error': 'File not found'}), 404

if __name__ == "__main__":
    # Script entry point: listen on every interface, port 7860.
    app.run(port=7860, host="0.0.0.0")