import os
# import numpy as np  # No longer needed for TTS
from flask import Flask, request, jsonify, send_file, send_from_directory
import google.generativeai as genai
from google.generativeai import types as genai_types  # For clarity if needed, or use genai.types
# from gtts import gTTS, lang  # Removed
import tempfile
# import soundfile as sf  # Removed, using the wave module instead
# from kokoro import KPipeline  # Removed
from werkzeug.utils import secure_filename
from flask_cors import CORS
import wave  # Added for saving WAV files

app = Flask(__name__, static_folder='static')
CORS(app)

# Configure Gemini API
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")
genai.configure(api_key=GEMINI_API_KEY)

# Transcription and translation model
TRANSCRIPTION_TRANSLATION_MODEL_NAME = "gemini-2.0-flash"

# Text-to-speech model (from the user's example). It must support the audio output modality
# for the response_mime_type-based approach below to work.
TTS_MODEL_NAME = "gemini-2.5-flash-preview-tts"

# Gemini TTS supported languages (display name -> BCP-47 code).
# Based on the user-provided list. The TTS API auto-detects the language from the text,
# so this mapping is primarily for the frontend language selector.
GEMINI_TTS_LANGUAGES = {
    "Arabic (Egyptian)": "ar-EG",
    "German (Germany)": "de-DE",
    "English (US)": "en-US",
    "Spanish (US)": "es-US",
    "French (France)": "fr-FR",
    "Hindi (India)": "hi-IN",
    "Indonesian (Indonesia)": "id-ID",
    "Italian (Italy)": "it-IT",
    "Japanese (Japan)": "ja-JP",
    "Korean (Korea)": "ko-KR",
    "Portuguese (Brazil)": "pt-BR",
    "Russian (Russia)": "ru-RU",
    "Dutch (Netherlands)": "nl-NL",
    "Polish (Poland)": "pl-PL",
    "Thai (Thailand)": "th-TH",
    "Turkish (Turkey)": "tr-TR",
    "Vietnamese (Vietnam)": "vi-VN",
    "Romanian (Romania)": "ro-RO",
    "Ukrainian (Ukraine)": "uk-UA",
    "Bengali (Bangladesh)": "bn-BD",
    "English (India)": "en-IN",
    "Marathi (India)": "mr-IN",
    "Tamil (India)": "ta-IN",
    "Telugu (India)": "te-IN",
}

SUPPORTED_LANGUAGES = sorted(GEMINI_TTS_LANGUAGES.keys())

# Helper function to save raw PCM data as a WAV file
def save_wave_file(filename, pcm_data, channels=1, sample_width=2, frame_rate=24000):
    """Saves PCM audio data to a WAV file."""
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)  # Bytes per sample (2 = 16-bit)
        wf.setframerate(frame_rate)
        wf.writeframes(pcm_data)
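
# Example (illustrative only): save_wave_file("/tmp/out.wav", pcm_bytes) writes mono,
# 16-bit, 24 kHz audio, matching the defaults used for the Gemini TTS output below.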

@app.route('/')  # Route path assumed: serve the static frontend at the site root
def serve_index():
    return send_from_directory(app.static_folder, 'index.html')

@app.route('/languages')  # Endpoint path assumed; must match the frontend's fetch URL
def get_languages():
    return jsonify(SUPPORTED_LANGUAGES)

@app.route('/translate', methods=['POST'])  # Endpoint path assumed; must match the frontend's fetch URL
def translate_audio():
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400

        audio_file = request.files['audio']
        target_language_display_name = request.form.get('language', 'English (US)')  # Default to a common one

        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400

        # Validate MIME type for transcription
        allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp3', 'audio/ogg', 'audio/flac', 'audio/mp4', 'audio/webm', 'audio/amr']
        if audio_file.mimetype not in allowed_mime_types:
            return jsonify({'error': f'Unsupported file type for transcription: {audio_file.mimetype}'}), 400

        # Initialize the Gemini model used for both transcription and translation
        model = genai.GenerativeModel(TRANSCRIPTION_TRANSLATION_MODEL_NAME)

        audio_data_bytes = audio_file.read()
        audio_blob = genai_types.Blob(mime_type=audio_file.mimetype, data=audio_data_bytes)

        # Get the transcription.
        # Keeping the transcription in the original language can be tricky if the model tends to
        # translate; a more robust prompt may be needed if issues arise.
        transcription_prompt = "You are a professional transcriber. Transcribe this audio accurately and verbatim in its original spoken language. Respond only with the transcription."

        # genai.upload_file could be used for larger files; here the audio is passed as a direct blob.
        response = model.generate_content([transcription_prompt, audio_blob])
        transcription = response.text.strip()
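
        # For larger uploads, the File API mentioned above could replace the inline blob.
        # Rough sketch (untested here; the path is illustrative):
        #   uploaded = genai.upload_file("/tmp/input.webm", mime_type=audio_file.mimetype)
        #   response = model.generate_content([transcription_prompt, uploaded])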

        # Translate the transcription using Gemini
        translation_prompt = f"Translate the following text to {target_language_display_name}. Preserve meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
        response = model.generate_content(translation_prompt)
        translated_text = response.text.strip()

        # Generate speech for the translation using Gemini TTS
        tts_model = genai.GenerativeModel(TTS_MODEL_NAME)

        # Gemini TTS detects the language from the text, and voice selection falls back to the
        # model's default voice for that language. The user's `speech_config` / `voice_name='Kore'`
        # snippet is not directly usable with the `google-generativeai` SDK's
        # `GenerativeModel.generate_content`; instead, `response_mime_type` is used to request audio output.
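
        # For reference, a rough, untested sketch of the voice-config approach from the user's
        # snippet, which requires the newer `google-genai` SDK rather than `google-generativeai`:
        #   from google import genai as genai_sdk
        #   from google.genai import types as sdk_types
        #   client = genai_sdk.Client(api_key=GEMINI_API_KEY)
        #   resp = client.models.generate_content(
        #       model=TTS_MODEL_NAME,
        #       contents=translated_text,
        #       config=sdk_types.GenerateContentConfig(
        #           response_modalities=["AUDIO"],
        #           speech_config=sdk_types.SpeechConfig(
        #               voice_config=sdk_types.VoiceConfig(
        #                   prebuilt_voice_config=sdk_types.PrebuiltVoiceConfig(voice_name="Kore")))))
        #   audio_pcm = resp.candidates[0].content.parts[0].inline_data.data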

        tts_generation_config = genai_types.GenerationConfig(
            response_mime_type="audio/wav"  # Gemini will output WAV audio
        )

        # The content for TTS is just the translated text.
        tts_response = tts_model.generate_content(
            contents=[translated_text],  # contents must be an iterable of Parts or strings
            generation_config=tts_generation_config,
        )

        if not (tts_response.candidates and tts_response.candidates[0].content.parts):
            raise ValueError("Gemini TTS did not return audio data.")

        audio_pcm_data = tts_response.candidates[0].content.parts[0].inline_data.data

        # Write the PCM to a temp WAV file; close the descriptor from mkstemp so it isn't leaked.
        fd, temp_output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        # Default parameters from the user's example: rate=24000, sample_width=2 (16-bit), channels=1
        save_wave_file(temp_output_path, audio_pcm_data, channels=1, sample_width=2, frame_rate=24000)

        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(temp_output_path)}'
        })

    except Exception as e:
        app.logger.error(f"Error processing request: {str(e)}", exc_info=True)
        return jsonify({'error': str(e)}), 500
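
# Serve generated WAV files back to the client; mkstemp above writes them to the system temp dir.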

@app.route('/download/<filename>')  # Matches the audio_url returned by translate_audio
def download_file(filename):
    try:
        # tempfile.gettempdir() is the directory where mkstemp creates files
        safe_name = secure_filename(filename)  # Prevent path traversal via the requested name
        file_path = os.path.join(tempfile.gettempdir(), safe_name)
        return send_file(
            file_path,
            mimetype="audio/wav",  # Changed from mpeg to wav
            as_attachment=True,
            download_name=f"translated_{safe_name}"  # Cleaner name
        )
    except FileNotFoundError:
        return jsonify({'error': 'File not found'}), 404
    except Exception as e:
        app.logger.error(f"Error downloading file: {str(e)}", exc_info=True)
        return jsonify({'error': f"Error downloading file: {str(e)}"}), 500

if __name__ == '__main__':
    # Consider gating debug mode behind an environment variable for production.
    app.run(host="0.0.0.0", port=7860)