Athspi's picture
Update app.py
7cc4829 verified
raw
history blame
4.15 kB
import os
from flask import Flask, request, jsonify, send_file, send_from_directory
from faster_whisper import WhisperModel
import google.generativeai as genai
from gtts import gTTS, lang
import tempfile
import soundfile as sf
from kokoro import KPipeline
from werkzeug.utils import secure_filename
from flask_cors import CORS
app = Flask(__name__, static_folder='static')
CORS(app)
# Configure Gemini API
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
raise ValueError("GEMINI_API_KEY environment variable not set")
genai.configure(api_key=GEMINI_API_KEY)
# Initialize Whisper model
model_size = "Systran/faster-whisper-large-v3"
try:
whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
except ValueError:
whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
# Language configurations
KOKORO_LANGUAGES = {
"American English": "a",
"British English": "b",
"Japanese": "j",
"Mandarin Chinese": "z",
"Spanish": "e",
"French": "f",
"Hindi": "h",
"Italian": "i",
"Brazilian Portuguese": "p"
}
GTTS_LANGUAGES = lang.tts_langs()
SUPPORTED_LANGUAGES = sorted(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values()))
@app.route('/')
def serve_index():
return send_from_directory(app.static_folder, 'index.html')
@app.route('/languages')
def get_languages():
return jsonify(SUPPORTED_LANGUAGES)
@app.route('/translate', methods=['POST'])
def translate_audio():
try:
if 'audio' not in request.files:
return jsonify({'error': 'No audio file uploaded'}), 400
audio_file = request.files['audio']
target_language = request.form.get('language', 'English')
if not audio_file or audio_file.filename == '':
return jsonify({'error': 'Invalid audio file'}), 400
# Save temporary audio file
filename = secure_filename(audio_file.filename)
temp_input_path = os.path.join(tempfile.gettempdir(), filename)
audio_file.save(temp_input_path)
# Transcribe audio
segments, info = whisper_model.transcribe(temp_input_path, beam_size=5)
transcription = " ".join([segment.text for segment in segments])
# Translate text
model = genai.GenerativeModel("gemini-2.0-flash")
prompt = f"Translate to {target_language} preserving meaning and cultural nuances:\n\n{transcription}"
response = model.generate_content(prompt)
translated_text = response.text.strip()
# Generate TTS
if target_language in KOKORO_LANGUAGES:
lang_code = KOKORO_LANGUAGES[target_language]
pipeline = KPipeline(lang_code=lang_code)
generator = pipeline(translated_text, voice="af_heart", speed=1)
audio_data = next((audio for _, _, audio in generator), None)
if audio_data:
_, temp_output_path = tempfile.mkstemp(suffix=".wav")
sf.write(temp_output_path, audio_data, 24000)
else:
lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
tts = gTTS(translated_text, lang=lang_code)
_, temp_output_path = tempfile.mkstemp(suffix=".mp3")
tts.save(temp_output_path)
return jsonify({
'transcription': transcription,
'translation': translated_text,
'audio_url': f'/download/{os.path.basename(temp_output_path)}'
})
except Exception as e:
app.logger.error(f"Error processing request: {str(e)}")
return jsonify({'error': str(e)}), 500
@app.route('/download/<filename>')
def download_file(filename):
try:
return send_file(
os.path.join(tempfile.gettempdir(), filename),
mimetype="audio/mpeg",
as_attachment=True,
download_name=f"translated_{filename}"
)
except FileNotFoundError:
return jsonify({'error': 'File not found'}), 404
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=True)