Spaces:

SolarumAsteridion
/

Gemini-Audio

Sleeping

App Files Files Community

SolarumAsteridion committed on Jul 30

Commit

40b0211

verified ·

1 Parent(s): 9668073

Create app.py

Browse files

Files changed (1) hide show

app.py +185 -0

app.py ADDED Viewed

	@@ -0,0 +1,185 @@

+# To run this code you need to install the following dependencies:
+# pip install flask google-genai
+import base64
+import mimetypes
+import os
+import re
+import struct
+import tempfile
+from datetime import datetime
+from flask import Flask, render_template, request, jsonify, send_file
+from google import genai
+from google.genai import types
+import io
+app = Flask(__name__)  # Flask application object; the @app.route handlers in this file register on it
+# Most recently generated WAV bytes (None until a generation succeeds). Module-level, so one value is shared by every request this process serves.
+latest_audio = None
+def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
+    """Generates a WAV file header for the given audio data and parameters."""
+    parameters = parse_audio_mime_type(mime_type)
+    bits_per_sample = parameters["bits_per_sample"]
+    sample_rate = parameters["rate"]
+    num_channels = 1
+    data_size = len(audio_data)
+    bytes_per_sample = bits_per_sample // 8
+    block_align = num_channels * bytes_per_sample
+    byte_rate = sample_rate * block_align
+    chunk_size = 36 + data_size
+    header = struct.pack(
+        "<4sI4s4sIHHIIHH4sI",
+        b"RIFF",
+        chunk_size,
+        b"WAVE",
+        b"fmt ",
+        16,
+        1,
+        num_channels,
+        sample_rate,
+        byte_rate,
+        block_align,
+        bits_per_sample,
+        b"data",
+        data_size
+    )
+    return header + audio_data
+def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
+    """Parses bits per sample and rate from an audio MIME type string."""
+    bits_per_sample = 16
+    rate = 24000
+    parts = mime_type.split(";")
+    for param in parts:
+        param = param.strip()
+        if param.lower().startswith("rate="):
+            try:
+                rate_str = param.split("=", 1)[1]
+                rate = int(rate_str)
+            except (ValueError, IndexError):
+                pass
+        elif param.startswith("audio/L"):
+            try:
+                bits_per_sample = int(param.split("L", 1)[1])
+            except (ValueError, IndexError):
+                pass
+    return {"bits_per_sample": bits_per_sample, "rate": rate}
+def generate_audio(text, voice="Zephyr", accent_type="hindi"):
+    """Generate audio from text using Gemini TTS"""
+    global latest_audio
+    client = genai.Client(
+        api_key=os.environ.get("GEMINI_API_KEY"),
+    )
+    model = "gemini-2.5-flash-preview-tts"
+    # Different accent prompts
+    accent_prompts = {
+        "hindi": "Speak with a clear Indian Hindi accent, with moderate intonation and expressiveness.",
+        "neutral": "Speak in a clear, neutral accent:",
+        "british": "Speak with a British English accent:",
+        "american": "Speak with an American English accent:"
+    }
+    prompt_text = f"{accent_prompts.get(accent_type, accent_prompts['hindi'])}\n\n{text}"
+    contents = [
+        types.Content(
+            role="user",
+            parts=[
+                types.Part.from_text(text=prompt_text),
+            ],
+        ),
+    ]
+    generate_content_config = types.GenerateContentConfig(
+        temperature=0.5,
+        seed=42,
+        response_modalities=["audio"],
+        speech_config=types.SpeechConfig(
+            voice_config=types.VoiceConfig(
+                prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                    voice_name=voice
+                )
+            )
+        ),
+    )
+    audio_data = None
+    mime_type = None
+    for chunk in client.models.generate_content_stream(
+        model=model,
+        contents=contents,
+        config=generate_content_config,
+    ):
+        if (
+            chunk.candidates is None
+            or chunk.candidates[0].content is None
+            or chunk.candidates[0].content.parts is None
+        ):
+            continue
+        if chunk.candidates[0].content.parts[0].inline_data and chunk.candidates[0].content.parts[0].inline_data.data:
+            inline_data = chunk.candidates[0].content.parts[0].inline_data
+            audio_data = inline_data.data
+            mime_type = inline_data.mime_type
+            break
+    if audio_data:
+        # Convert to WAV format
+        wav_data = convert_to_wav(audio_data, mime_type)
+        latest_audio = wav_data
+        return True
+    return False
+@app.route('/')
+def index():
+    return render_template('index.html')
+@app.route('/generate', methods=['POST'])
+def generate():
+    try:
+        data = request.json
+        text = data.get('text', '')
+        voice = data.get('voice', 'Zephyr')
+        accent = data.get('accent', 'hindi')
+        if not text:
+            return jsonify({'error': 'Text is required'}), 400
+        # Generate audio
+        success = generate_audio(text, voice, accent)
+        if success and latest_audio:
+            # Convert to base64 for sending to frontend
+            audio_base64 = base64.b64encode(latest_audio).decode('utf-8')
+            return jsonify({
+                'success': True,
+                'audio': audio_base64
+            })
+        else:
+            return jsonify({'error': 'Failed to generate audio'}), 500
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+@app.route('/download')
+def download():
+    if latest_audio:
+        return send_file(
+            io.BytesIO(latest_audio),
+            mimetype='audio/wav',
+            as_attachment=True,
+            download_name=f'generated_audio_{datetime.now().strftime("%Y%m%d_%H%M%S")}.wav'
+        )
+    return jsonify({'error': 'No audio available'}), 404
+if __name__ == '__main__':
+    app.run(debug=True, port=5000)