Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Audio-translation / app.py

Athspi

Update app.py

bfc5175 verified 3 months ago

raw

history blame

6.64 kB

	import os
	import tempfile
	import uuid
	import wave

	import numpy as np
	import soundfile as sf
	from flask import Flask, request, jsonify, send_file, send_from_directory
	from flask_cors import CORS
	from google import generativeai as genai
	from google.genai import Client as GenAIClient
	from google.genai import types
	from gtts import gTTS, lang
	from kokoro import KPipeline
	from werkzeug.exceptions import NotFound
	from werkzeug.utils import secure_filename

	# Flask app setup: static assets are served from ./static and CORS is
	# enabled for every route.
	app = Flask(__name__, static_folder='static')
	CORS(app)

	# Gemini API configuration. The key is required; fail fast at import time
	# rather than on the first request.
	GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
	if not GEMINI_API_KEY:
	    raise ValueError("GEMINI_API_KEY environment variable not set")

	# `genai` is the legacy google-generativeai package; it backs the
	# genai.GenerativeModel calls used for transcription and translation.
	genai.configure(api_key=GEMINI_API_KEY)

	# The speech request is built from google.genai `types`, so its client must
	# come from that same SDK. The legacy `genai` module exposes no `Client`.
	client = GenAIClient(api_key=GEMINI_API_KEY)

	# Language support
	# Kokoro: display name -> single-letter code handed to KPipeline(lang_code=...).
	KOKORO_LANGUAGES = {
	    "American English": "a",
	    "British English": "b",
	    "Mandarin Chinese": "z",
	    "Spanish": "e",
	    "French": "f",
	    "Hindi": "h",
	    "Italian": "i",
	    "Brazilian Portuguese": "p",
	}

	# gTTS: language code -> display name, with Japanese set explicitly.
	GTTS_LANGUAGES = lang.tts_langs()
	GTTS_LANGUAGES['ja'] = 'Japanese'

	# Every display name known to either engine, de-duplicated and sorted.
	SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

	# Gemini TTS: BCP-47 locale -> prebuilt voice name. Every locale currently
	# uses the same voice.
	GEMINI_VOICES = dict.fromkeys(
	    (
	        "ar-EG", "de-DE", "en-US", "es-US", "fr-FR",
	        "hi-IN", "id-ID", "it-IT", "ja-JP", "ko-KR",
	        "pt-BR", "ru-RU", "nl-NL", "pl-PL", "th-TH",
	        "tr-TR", "vi-VN", "ro-RO", "uk-UA", "bn-BD",
	        "en-IN", "mr-IN", "ta-IN", "te-IN",
	    ),
	    "Kore",
	)


	def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
	    """Write raw PCM frames to *filename* as a WAV file.

	    Args:
	        filename: Destination path, opened for binary writing.
	        pcm: Bytes of PCM audio frames, written unchanged.
	        channels: Number of audio channels.
	        rate: Frame rate in Hz.
	        sample_width: Bytes per sample.
	    """
	    with wave.open(filename, "wb") as writer:
	        # The header fields are independent of each other; all must be set
	        # before the first frames are written.
	        writer.setframerate(rate)
	        writer.setsampwidth(sample_width)
	        writer.setnchannels(channels)
	        writer.writeframes(pcm)


	@app.route('/')
	def serve_index():
	    """Serve ``index.html`` from the application's static folder."""
	    static_root = app.static_folder
	    return send_from_directory(static_root, 'index.html')


	@app.route('/languages')
	def get_languages():
	    """Return ``SUPPORTED_LANGUAGES`` as a JSON response."""
	    payload = jsonify(SUPPORTED_LANGUAGES)
	    return payload


	def _synthesize_speech(text, target_language):
	    """Render *text* as speech and return the path of the generated file.

	    The engine is picked in this order: Gemini TTS when the language maps to
	    a locale in ``GEMINI_VOICES``, then Kokoro when the language is listed in
	    ``KOKORO_LANGUAGES``, otherwise gTTS (using ``'en'`` when the language
	    name is not found in ``GTTS_LANGUAGES``).

	    Raises:
	        ValueError: If Kokoro is selected but yields no audio.
	    """
	    # A random stem keeps concurrent requests that upload identically named
	    # files from overwriting each other's output.
	    stem = uuid.uuid4().hex
	    out_dir = tempfile.gettempdir()

	    voice_name = GEMINI_VOICES.get(get_bcp47_code(target_language))
	    if voice_name:
	        response = client.models.generate_content(
	            model="gemini-2.5-flash-preview-tts",
	            contents=text,
	            config=types.GenerateContentConfig(
	                response_modalities=["AUDIO"],
	                speech_config=types.SpeechConfig(
	                    voice_config=types.VoiceConfig(
	                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
	                    )
	                ),
	            ),
	        )
	        pcm = response.candidates[0].content.parts[0].inline_data.data
	        out_path = os.path.join(out_dir, f'tts_{stem}.wav')
	        # The payload is written with wave_file's defaults (mono, 24 kHz, 16-bit).
	        wave_file(out_path, pcm)
	        return out_path

	    if target_language in KOKORO_LANGUAGES:
	        pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[target_language])
	        segments = [
	            audio
	            for _, _, audio in pipeline(text, voice="af_heart", speed=1)
	            if audio is not None
	        ]
	        if not segments:
	            raise ValueError("No audio generated by Kokoro")
	        out_path = os.path.join(out_dir, f'kokoro_{stem}.wav')
	        sf.write(out_path, np.concatenate(segments), 24000)
	        return out_path

	    lang_code = next(
	        (code for code, name in GTTS_LANGUAGES.items() if name == target_language),
	        'en',
	    )
	    out_path = os.path.join(out_dir, f'gtts_{stem}.mp3')
	    gTTS(text, lang=lang_code).save(out_path)
	    return out_path


	@app.route('/translate', methods=['POST'])
	def translate_audio():
	    """Transcribe an uploaded audio file, translate it, and synthesize speech.

	    Expects a multipart POST with an ``audio`` file and an optional
	    ``language`` form field (defaults to ``'English'``).

	    Returns:
	        A JSON object with ``transcription``, ``translation`` and
	        ``audio_url`` (a ``/download/<name>`` path) on success.
	        A JSON ``error`` with status 400 when the upload is missing, unnamed,
	        or of an unsupported MIME type; status 500 for any other failure.
	    """
	    try:
	        if 'audio' not in request.files:
	            return jsonify({'error': 'No audio file uploaded'}), 400

	        audio_file = request.files['audio']
	        target_language = request.form.get('language', 'English')

	        if not audio_file or audio_file.filename == '':
	            return jsonify({'error': 'Invalid audio file'}), 400

	        allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
	        if audio_file.mimetype not in allowed_mime_types:
	            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400

	        model = genai.GenerativeModel("gemini-2.0-flash")
	        audio_blob = {'mime_type': audio_file.mimetype, 'data': audio_file.read()}

	        # Step 1: transcription. The instruction and the audio are sent as two
	        # turns of one chat; only the reply to the audio turn is used.
	        convo = model.start_chat()
	        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
	        response = convo.send_message(audio_blob)
	        transcription = response.text.strip()

	        # Step 2: translation, as a separate single-shot request.
	        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
	        translated_text = model.generate_content(prompt).text.strip()

	        # Step 3: text-to-speech.
	        audio_path = _synthesize_speech(translated_text, target_language)

	        return jsonify({
	            'transcription': transcription,
	            'translation': translated_text,
	            'audio_url': f'/download/{os.path.basename(audio_path)}'
	        })

	    except Exception as e:
	        # Top-level boundary: log with the traceback and report the failure
	        # to the caller as JSON.
	        app.logger.exception("Error processing request: %s", e)
	        return jsonify({'error': str(e)}), 500


	@app.route('/download/<filename>')
	def download_file(filename):
	    """Send a generated audio file from the temp directory as an attachment.

	    Args:
	        filename: Base name of the file, as found in the ``audio_url``
	            returned by ``/translate``.

	    Returns:
	        The file as a download named ``translated_<filename>``, or a JSON
	        ``error`` with status 404 when the name is not a generated audio file
	        or the file does not exist.
	    """
	    # The temp directory is shared, so only names carrying one of the
	    # prefixes written by /translate are served.
	    if not filename.startswith(('tts_', 'kokoro_', 'gtts_')):
	        return jsonify({'error': 'File not found'}), 404

	    try:
	        # No explicit mimetype: outputs are either .wav or .mp3, so the
	        # Content-Type is left for Flask to infer from the file name.
	        return send_from_directory(
	            tempfile.gettempdir(),
	            filename,
	            as_attachment=True,
	            download_name=f"translated_{filename}",
	        )
	    except NotFound:
	        # send_from_directory signals a missing or unsafe path with NotFound;
	        # keep the JSON error body for API clients.
	        return jsonify({'error': 'File not found'}), 404


	def get_bcp47_code(language):
	    """Return the BCP-47 locale tag for a language display name.

	    The lookup is an exact, case-sensitive match; names without an entry
	    yield ``None``.
	    """
	    locale_by_name = {
	        "Arabic": "ar-EG",
	        "German": "de-DE",
	        "English": "en-US",
	        "Spanish": "es-US",
	        "French": "fr-FR",
	        "Hindi": "hi-IN",
	        "Indonesian": "id-ID",
	        "Italian": "it-IT",
	        "Japanese": "ja-JP",
	        "Korean": "ko-KR",
	        "Portuguese": "pt-BR",
	        "Russian": "ru-RU",
	        "Dutch": "nl-NL",
	        "Polish": "pl-PL",
	        "Thai": "th-TH",
	        "Turkish": "tr-TR",
	        "Vietnamese": "vi-VN",
	        "Romanian": "ro-RO",
	        "Ukrainian": "uk-UA",
	        "Bengali": "bn-BD",
	        "Indian English": "en-IN",
	        "Marathi": "mr-IN",
	        "Tamil": "ta-IN",
	        "Telugu": "te-IN",
	    }
	    locale = locale_by_name.get(language)
	    return locale


	if __name__ == '__main__':
	    # Start Flask's built-in server on every interface, port 7860.
	    app.run(port=7860, host="0.0.0.0")