import base64
import io
import mimetypes
import os
import re
import struct
import tempfile
from datetime import datetime

from flask import Flask, render_template, request, jsonify, send_file
from google import genai
from google.genai import types

app = Flask(__name__)

# Most recently generated WAV bytes, held in memory at module level.
# generate_audio() overwrites it and the routes read it back.
latest_audio = None

TTS_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_ACCENT = "hindi"

# Instruction that is prepended to the user's text for each supported accent.
ACCENT_PROMPTS = {
    "hindi": (
        "Speak with a clear Indian Hindi accent, with low intonation and "
        "expressiveness. Do not say it aloud like a story. "
        "Be conversational like a customer care agent."
    ),
    "neutral": "Speak in a clear, neutral accent:",
    "british": "Speak with a British English accent:",
    "american": "Speak with an American English accent:",
}


def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Wrap audio sample bytes in a WAV (RIFF) container.

    The 44-byte header declares uncompressed PCM (format tag 1) with a single
    channel; sample width and rate are taken from ``mime_type``.

    Args:
        audio_data: Sample bytes to place in the ``data`` chunk.
        mime_type: Audio MIME type, parsed by ``parse_audio_mime_type``.

    Returns:
        The WAV header followed by ``audio_data`` unchanged.
    """
    parameters = parse_audio_mime_type(mime_type)
    bits_per_sample = parameters["bits_per_sample"]
    sample_rate = parameters["rate"]
    num_channels = 1
    data_size = len(audio_data)
    bytes_per_sample = bits_per_sample // 8
    block_align = num_channels * bytes_per_sample
    byte_rate = sample_rate * block_align
    # RIFF chunk size = everything after the first 8 bytes of the file.
    chunk_size = 36 + data_size

    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        chunk_size,
        b"WAVE",
        b"fmt ",
        16,  # size of the fmt sub-chunk
        1,  # audio format tag: PCM
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b"data",
        data_size,
    )
    return header + audio_data


def _parse_positive_int(raw):
    """Return ``raw`` as a positive int, or None if it is not one."""
    try:
        value = int(raw)
    except (TypeError, ValueError):
        return None
    return value if value > 0 else None


def parse_audio_mime_type(mime_type: str):
    """Extract bits per sample and sample rate from an audio MIME type.

    Recognises a ``rate=<n>`` parameter and an ``audio/L<bits>`` type, both
    matched case-insensitively. Missing, malformed or non-positive values
    leave the defaults (16 bits, 24000 Hz) in place, and a ``None`` or empty
    ``mime_type`` yields the defaults as well.

    Args:
        mime_type: For example ``"audio/L16;rate=24000"``.

    Returns:
        A dict with the integer keys ``"bits_per_sample"`` and ``"rate"``.
    """
    bits_per_sample = 16
    rate = 24000

    for param in (mime_type or "").split(";"):
        param = param.strip()
        lowered = param.lower()
        if lowered.startswith("rate="):
            value = _parse_positive_int(param.split("=", 1)[1])
            if value is not None:
                rate = value
        elif lowered.startswith("audio/l"):
            value = _parse_positive_int(param[len("audio/L"):])
            if value is not None:
                bits_per_sample = value

    return {"bits_per_sample": bits_per_sample, "rate": rate}


def _collect_inline_audio(chunks):
    """Join the inline audio payload of every streamed response chunk.

    Returns ``(audio_bytes, mime_type)`` where ``mime_type`` comes from the
    first chunk that carried audio; ``audio_bytes`` is empty when none did.
    """
    pieces = []
    mime_type = None
    for chunk in chunks:
        candidates = chunk.candidates
        # Covers both None and an empty list before indexing.
        if not candidates or candidates[0].content is None:
            continue
        for part in candidates[0].content.parts or ():
            inline_data = part.inline_data
            if inline_data and inline_data.data:
                pieces.append(inline_data.data)
                if mime_type is None:
                    mime_type = inline_data.mime_type
    return b"".join(pieces), mime_type


def generate_audio(text, voice="Zephyr", accent_type="hindi"):
    """Generate speech for ``text`` with Gemini TTS and store it as WAV.

    On success the WAV bytes are written to the module-level ``latest_audio``.

    Args:
        text: Text to speak.
        voice: Name of the prebuilt voice passed to the API.
        accent_type: Key into ``ACCENT_PROMPTS``; unknown keys fall back to
            the ``"hindi"`` prompt.

    Returns:
        True if the response contained audio, otherwise False.
    """
    global latest_audio

    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    instruction = ACCENT_PROMPTS.get(accent_type, ACCENT_PROMPTS[DEFAULT_ACCENT])
    prompt_text = f"{instruction}\n\n{text}"

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=prompt_text),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=0.5,
        seed=42,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
    )

    # The stream may deliver the audio in several chunks; keep all of them so
    # the clip is not cut off after the first one.
    stream = client.models.generate_content_stream(
        model=TTS_MODEL,
        contents=contents,
        config=generate_content_config,
    )
    audio_data, mime_type = _collect_inline_audio(stream)

    if not audio_data:
        return False

    latest_audio = convert_to_wav(audio_data, mime_type)
    return True


@app.route('/')
def index():
    """Render the front-end page."""
    return render_template('index.html')


@app.route('/generate', methods=['POST'])
def generate():
    """Generate audio for the posted JSON and return it base64-encoded.

    Expects a JSON object with ``text`` and optional ``voice`` and ``accent``.
    Responds 400 when ``text`` is missing or the body is not a JSON object,
    and 500 when generation fails or raises.
    """
    try:
        # silent=True yields None instead of raising on a missing or invalid
        # JSON body, so bad client input is answered with 400 rather than 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        text = data.get('text', '')
        voice = data.get('voice', 'Zephyr')
        accent = data.get('accent', 'hindi')

        if not text:
            return jsonify({'error': 'Text is required'}), 400

        success = generate_audio(text, voice, accent)

        if success and latest_audio:
            # Base64 so the binary WAV can travel inside the JSON response.
            audio_base64 = base64.b64encode(latest_audio).decode('utf-8')
            return jsonify({
                'success': True,
                'audio': audio_base64
            })
        return jsonify({'error': 'Failed to generate audio'}), 500

    except Exception as e:
        # Top-level boundary: record the traceback, then report to the client.
        app.logger.exception("Audio generation failed")
        return jsonify({'error': str(e)}), 500

@app.route('/download')
def download():
    """Send the most recently generated audio as a WAV attachment.

    Responds with a 404 JSON error when ``latest_audio`` holds nothing.
    The attachment name carries the current local time as a timestamp.
    """
    if not latest_audio:
        return jsonify({'error': 'No audio available'}), 404

    wav_stream = io.BytesIO(latest_audio)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return send_file(
        wav_stream,
        mimetype='audio/wav',
        as_attachment=True,
        download_name=f'generated_audio_{timestamp}.wav',
    )