Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Audio-translation / app.py

Athspi

Update app.py

e51d62b verified 3 months ago

raw

history blame

6.42 kB

	import os
	import tempfile
	import wave
	import numpy as np
	import soundfile as sf

	from flask import Flask, request, jsonify, send_file, send_from_directory
	from flask_cors import CORS
	from werkzeug.utils import secure_filename

	from gtts import gTTS, lang
	from kokoro import KPipeline

	import google.generativeai as genai
	from google.generativeai.types import (
	GenerateContentConfig,
	SpeechConfig,
	VoiceConfig,
	PrebuiltVoiceConfig,
	)

	# -----------------------------------------------------------------------------
	# Configuration
	# -----------------------------------------------------------------------------

	# 1) Make sure you've run:
	# pip install --upgrade google-generativeai gTTS soundfile kokoro flask flask-cors werkzeug
	#
	# 2) Set your Gemini API key in the environment:
	# export GEMINI_API_KEY="your_real_api_key_here"

	# Read the Gemini API key from the environment and refuse to start without it.
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
	if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
	    raise RuntimeError("GEMINI_API_KEY environment variable not set")

	# Configure the module-level SDK state and build the client object that the
	# text-to-speech call in /translate goes through.
	# NOTE(review): `genai.configure` looks like the `google-generativeai` API while
	# `genai.Client` (and the GenerateContentConfig-style types imported above) look
	# like the newer `google-genai` SDK -- confirm both are really available from the
	# single `google.generativeai` import used by this file.
	genai.configure(api_key=GEMINI_API_KEY)
	client = genai.Client(api_key=GEMINI_API_KEY)

	# Kokoro and gTTS language maps
	#
	# KOKORO_LANGUAGES maps a display name to the language code handed to
	# ``KPipeline(lang_code=...)``.
	KOKORO_LANGUAGES = {
	    "American English": "a",
	    "British English": "b",
	    "Mandarin Chinese": "z",
	    "Spanish": "e",
	    "French": "f",
	    "Hindi": "h",
	    "Italian": "i",
	    "Brazilian Portuguese": "p",
	}
	# GTTS_LANGUAGES maps a gTTS language code to its display name.
	GTTS_LANGUAGES = lang.tts_langs()
	GTTS_LANGUAGES["ja"] = "Japanese"  # explicit Japanese support

	# Every display name either engine can speak, de-duplicated and sorted; this is
	# what the /languages endpoint returns.
	SUPPORTED_LANGUAGES = sorted(
	    set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values())
	)

	# Voice name for Gemini TTS preview; passed as ``voice_name`` in the
	# PrebuiltVoiceConfig built by the /translate handler.
	GEMINI_VOICE_NAME = "Kore"

	# -----------------------------------------------------------------------------
	# Helpers
	# -----------------------------------------------------------------------------

	def wave_file(filename: str, pcm: bytes, channels=1, rate=24000, sample_width=2):
	    """Store raw PCM frames as an uncompressed WAV file at ``filename``.

	    ``channels``, ``rate`` (frames per second) and ``sample_width`` (bytes per
	    sample) describe the layout of ``pcm`` and are written into the WAV header.
	    """
	    writer = wave.open(filename, "wb")
	    try:
	        # (nchannels, sampwidth, framerate, nframes, comptype, compname); the
	        # frame count is corrected by the wave module once the data is written.
	        writer.setparams((channels, sample_width, rate, 0, "NONE", "not compressed"))
	        writer.writeframes(pcm)
	    finally:
	        writer.close()

	# -----------------------------------------------------------------------------
	# Flask App
	# -----------------------------------------------------------------------------

	# The Flask application; files under ./static are its static folder, which is
	# where the index route looks for index.html.
	app = Flask(__name__, static_folder="static")
	# Enable CORS on the app with flask-cors' default settings.
	# NOTE(review): the defaults are presumably permissive (all origins) -- confirm
	# that is intended for this deployment.
	CORS(app)


	@app.route("/")
	def serve_index():
	    """Serve the front-end page, ``index.html``, from the app's static folder."""
	    static_dir = app.static_folder
	    return send_from_directory(static_dir, "index.html")


	@app.route("/languages")
	def list_languages():
	    """Return the ``SUPPORTED_LANGUAGES`` list as a JSON response."""
	    payload = jsonify(SUPPORTED_LANGUAGES)
	    return payload


	# MIME types accepted for the uploaded audio.
	_ALLOWED_AUDIO_MIMETYPES = ("audio/wav", "audio/mpeg", "audio/mp4", "audio/webm")

	# Generated audio older than this many seconds is deleted on later requests.
	_TTS_FILE_MAX_AGE_SECONDS = 3600

	# gTTS codes for the Kokoro display names that do not appear verbatim among the
	# gTTS display names; only consulted when gTTS stands in for a failed Kokoro run.
	_GTTS_ALIASES = {
	    "American English": "en",
	    "British English": "en",
	    "Mandarin Chinese": "zh-CN",
	    "Brazilian Portuguese": "pt",
	}


	def _new_tts_path(engine, suffix):
	    """Reserve a uniquely named ``tts_<engine>_*<suffix>`` file in the temp dir."""
	    # Unique names keep concurrent requests from overwriting each other's audio.
	    fd, path = tempfile.mkstemp(
	        prefix=f"tts_{engine}_", suffix=suffix, dir=tempfile.gettempdir()
	    )
	    os.close(fd)
	    return path


	def _prune_stale_tts_files(max_age_seconds=_TTS_FILE_MAX_AGE_SECONDS):
	    """Best-effort removal of generated audio files older than ``max_age_seconds``."""
	    import time  # local import: `time` is not among the file-level imports

	    cutoff = time.time() - max_age_seconds
	    try:
	        with os.scandir(tempfile.gettempdir()) as entries:
	            for entry in entries:
	                name = entry.name
	                if not (name.startswith("tts_") and name.endswith((".wav", ".mp3"))):
	                    continue
	                try:
	                    if entry.is_file() and entry.stat().st_mtime < cutoff:
	                        os.remove(entry.path)
	                except OSError:
	                    # File vanished or is not ours to delete; cleanup is optional.
	                    continue
	    except OSError:
	        return


	def _gtts_code_for(target_lang):
	    """Return the gTTS language code for a display name, defaulting to ``"en"``."""
	    alias = _GTTS_ALIASES.get(target_lang)
	    if alias in GTTS_LANGUAGES:
	        return alias
	    return next(
	        (code for code, name in GTTS_LANGUAGES.items() if name == target_lang),
	        "en",
	    )


	def _synthesize_gemini(text):
	    """Synthesize ``text`` with the Gemini TTS preview model; return the WAV path."""
	    tts_resp = client.models.generate_content(
	        model="gemini-2.5-flash-preview-tts",
	        contents=text,
	        config=GenerateContentConfig(
	            response_modalities=["AUDIO"],
	            speech_config=SpeechConfig(
	                voice_config=VoiceConfig(
	                    prebuilt_voice_config=PrebuiltVoiceConfig(
	                        voice_name=GEMINI_VOICE_NAME
	                    )
	                )
	            ),
	        ),
	    )
	    pcm_data = tts_resp.candidates[0].content.parts[0].inline_data.data
	    if not pcm_data:
	        raise RuntimeError("Gemini TTS returned no audio")
	    out_path = _new_tts_path("gemini", ".wav")
	    wave_file(out_path, pcm_data)
	    return out_path


	def _synthesize_kokoro(text, lang_code):
	    """Synthesize ``text`` with Kokoro for ``lang_code``; return the WAV path."""
	    pipeline = KPipeline(lang_code=lang_code)
	    generator = pipeline(text, voice="af_heart", speed=1)
	    segments = [audio for _, _, audio in generator if audio is not None]
	    if not segments:
	        raise RuntimeError("Kokoro produced no audio")
	    out_path = _new_tts_path("kokoro", ".wav")
	    sf.write(out_path, np.concatenate(segments), 24000)
	    return out_path


	def _synthesize_gtts(text, target_lang):
	    """Synthesize ``text`` with gTTS; return the MP3 path."""
	    gtts_code = _gtts_code_for(target_lang)
	    out_path = _new_tts_path("gtts", ".mp3")
	    gTTS(text, lang=gtts_code).save(out_path)
	    return out_path


	def _synthesize_speech(text, target_lang):
	    """Try Gemini TTS, then Kokoro (if it knows the language), then gTTS."""
	    try:
	        return _synthesize_gemini(text)
	    except Exception:
	        # Deliberate best-effort: any Gemini failure falls through, but is logged.
	        app.logger.warning("Gemini TTS failed; trying fallback engines", exc_info=True)

	    kokoro_code = KOKORO_LANGUAGES.get(target_lang)
	    if kokoro_code is not None:
	        try:
	            return _synthesize_kokoro(text, kokoro_code)
	        except Exception:
	            # A Kokoro failure no longer aborts the request; gTTS is still tried.
	            app.logger.warning("Kokoro TTS failed; falling back to gTTS", exc_info=True)

	    return _synthesize_gtts(text, target_lang)


	@app.route("/translate", methods=["POST"])
	def translate_audio():
	    """Transcribe an uploaded audio file, translate it, and synthesize speech.

	    Expects a multipart form with an ``audio`` file and an optional ``language``
	    field (defaults to ``"English"``). On success returns JSON with
	    ``transcription``, ``translation`` and an ``audio_url`` pointing at the
	    ``/download/<filename>`` route. Returns a 400 JSON error for a missing,
	    empty-named or unsupported upload, and a 500 JSON error carrying the
	    exception text for any other failure.
	    """
	    try:
	        # 1. Receive file + target language
	        if "audio" not in request.files:
	            return jsonify(error="No audio file uploaded"), 400

	        audio_file = request.files["audio"]
	        # A blank language field would otherwise be interpolated into the prompt.
	        target_lang = (request.form.get("language") or "").strip() or "English"

	        if not audio_file or audio_file.filename == "":
	            return jsonify(error="Invalid audio file"), 400

	        # 2. Validate MIME type
	        if audio_file.mimetype not in _ALLOWED_AUDIO_MIMETYPES:
	            return jsonify(error=f"Unsupported file type: {audio_file.mimetype}"), 400

	        # 3. Transcribe with Gemini
	        model = genai.GenerativeModel("gemini-2.0-flash")
	        blob = {"mime_type": audio_file.mimetype, "data": audio_file.read()}

	        convo = model.start_chat()
	        convo.send_message(
	            "You are a professional transcriber. Transcribe this audio accurately, verbatim."
	        )
	        resp = convo.send_message(blob)
	        transcription = resp.text.strip()

	        # 4. Translate with Gemini
	        prompt = f"Translate the following text to {target_lang}, preserving meaning and cultural nuances:\n\n{transcription}"
	        translation_resp = model.generate_content(prompt)
	        translated_text = translation_resp.text.strip()

	        # 5. Synthesize speech (Gemini TTS -> Kokoro -> gTTS)
	        _prune_stale_tts_files()
	        out_path = _synthesize_speech(translated_text, target_lang)

	        return jsonify(
	            transcription=transcription,
	            translation=translated_text,
	            audio_url=f"/download/{os.path.basename(out_path)}",
	        )

	    except Exception as e:
	        app.logger.exception("Error in /translate")
	        return jsonify(error=str(e)), 500


	@app.route("/download/<filename>")
	def download_file(filename):
	    """Send a generated audio file from the system temp directory as an attachment.

	    Only names that survive ``secure_filename`` unchanged, start with ``tts_``
	    and end in ``.wav`` or ``.mp3`` are served, so this route cannot be used to
	    fetch unrelated files that happen to live in the temp directory. Anything
	    else, or a name with no matching file, gets a 404 JSON error.
	    """
	    safe_name = secure_filename(filename)
	    is_generated_audio = (
	        safe_name == filename
	        and safe_name.startswith("tts_")
	        and safe_name.endswith((".wav", ".mp3"))
	    )
	    if not is_generated_audio:
	        return jsonify(error="File not found"), 404

	    path = os.path.join(tempfile.gettempdir(), safe_name)
	    if not os.path.isfile(path):
	        return jsonify(error="File not found"), 404
	    return send_file(path, as_attachment=True, download_name=f"translated_{safe_name}")


	if __name__ == "__main__":
	    # Script entry point: start Flask's built-in server, listening on every
	    # network interface at port 7860.
	    app.run(port=7860, host="0.0.0.0")