Spaces:

Athspi-aitools
/

Aittsg

Running

App Files Community

Athspi committed 19 days ago

Commit

515f8f3

verified ·

1 Parent(s): 25a22ef

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -171

app.py CHANGED Viewed

@@ -1,196 +1,133 @@
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import FileResponse
 from google import genai
 from google.genai import types
-import os
 import wave
-import tempfile
-import gradio as gr
-import uvicorn
 from typing import Optional
 from pydantic import BaseModel
-# ─── Configuration ────────────────────────────────────────────────────────────
-class Config:
-    API_KEYS = [
-        os.getenv("GOOGLE_API_KEY_1"),
-        os.getenv("GOOGLE_API_KEY_2")
-    ]
-    MODEL_CHOICES = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
-    VOICE_NAMES = [
-        "Achernar", "Achird", "Algenib", "Algieba", "Alnilam", "Aoede", "Autonoe",
-        "Callirrhoe", "Charon", "Despina", "Enceladus", "Erinome", "Fenrir", "Gacrux",
-        "Iapetus", "Kore", "Laomedeia", "Leda", "Orus", "Puck", "Pulcherrima", "Rasalgethi",
-        "Sadachbia", "Sadaltager", "Schedar", "Sulafat", "Umbriel", "Vindemiatrix", "Zephyr", "Zubenelgenubi"
-    ]
-# ─── API Models ───────────────────────────────────────────────────────────────
 class TTSRequest(BaseModel):
     text: str
-    model: str = Config.MODEL_CHOICES[0]
-    voice_name: str = Config.VOICE_NAMES[0]
-    api_key: Optional[str] = None
-# ─── Core TTS Engine ──────────────────────────────────────────────────────────
-class TTSEngine:
-    def __init__(self):
-        self.clients = {}
-        self.init_clients()
-    def init_clients(self):
-        for idx, key in enumerate(Config.API_KEYS):
-            if key:
-                try:
-                    self.clients[f"client_{idx}"] = genai.Client(api_key=key)
-                except Exception as e:
-                    print(f"Failed to initialize client with key {idx}: {str(e)}")
-    def generate_audio(self, request: TTSRequest) -> str:
-        client = None
-        used_key = request.api_key
-        # Try user-provided key first
-        if used_key:
-            try:
-                client = genai.Client(api_key=used_key)
-            except Exception:
-                pass
-        # Fallback to pre-configured clients
-        if not client:
-            for client_name, existing_client in self.clients.items():
-                try:
-                    response = existing_client.models.generate_content(
-                        model=request.model,
-                        contents=request.text,
-                        config=types.GenerateContentConfig(
-                            response_modalities=["AUDIO"],
-                            speech_config=types.SpeechConfig(
-                                voice_config=types.VoiceConfig(
-                                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
-                                        voice_name=request.voice_name
-                                    )
-                                )
-                            ),
-                        ),
-                    )
-                    client = existing_client
-                    used_key = client_name
-                    break
-                except Exception:
-                    continue
-        if not client:
-            raise HTTPException(status_code=500, detail="No working API keys available")
-        try:
-            response = client.models.generate_content(
-                model=request.model,
-                contents=request.text,
-                config=types.GenerateContentConfig(
-                    response_modalities=["AUDIO"],
-                    speech_config=types.SpeechConfig(
-                        voice_config=types.VoiceConfig(
-                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
-                                voice_name=request.voice_name
-                            )
                         )
-                    ),
                 ),
             )
-            if response.candidates and response.candidates[0].content.parts:
-                pcm_data = response.candidates[0].content.parts[0].inline_data.data
-                return self.create_wave_file(pcm_data)
-            raise ValueError("Invalid response format from API")
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=str(e))
-    @staticmethod
-    def create_wave_file(pcm_bytes: bytes) -> str:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-            with wave.open(tmp_file.name, "wb") as wf:
-                wf.setnchannels(1)
-                wf.setsampwidth(2)
-                wf.setframerate(24000)
-                wf.writeframes(pcm_bytes)
-            return tmp_file.name
-# ─── FastAPI App ──────────────────────────────────────────────────────────────
-app = FastAPI(title="Gemini TTS API", version="1.0")
-engine = TTSEngine()
-@app.post("/generate/")
-async def generate_tts(request: TTSRequest):
-    try:
-        audio_path = engine.generate_audio(request)
-        return FileResponse(audio_path, media_type="audio/wav")
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-# ─── Gradio Interface ─────────────────────────────────────────────────────────
-def create_gradio_interface():
-    with gr.Blocks(title="Gemini TTS", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("## 🎤 Gemini TTS Voice Generator")
-        with gr.Row():
-            with gr.Column():
-                model_choice = gr.Dropdown(
-                    choices=Config.MODEL_CHOICES,
-                    value=Config.MODEL_CHOICES[0],
-                    label="Model"
-                )
-                voice_name = gr.Dropdown(
-                    choices=Config.VOICE_NAMES,
-                    value=Config.VOICE_NAMES[0],
-                    label="Voice"
-                )
-                api_key = gr.Textbox(
-                    label="API Key (optional)",
-                    type="password",
-                    placeholder="Leave empty to use configured keys"
-                )
-            with gr.Column():
-                text_input = gr.Textbox(
-                    label="Text to Convert",
-                    lines=5,
-                    placeholder="Enter text here..."
-                )
-                generate_btn = gr.Button("Generate Speech")
-                audio_output = gr.Audio(label="Generated Audio")
-                error_output = gr.Textbox(label="Error", visible=False)
-        def generate(text, model, voice, key):
-            try:
-                request = TTSRequest(text=text, model=model, voice_name=voice, api_key=key or None)
-                audio_path = engine.generate_audio(request)
-                return audio_path, ""
-            except Exception as e:
-                return None, str(e)
-        generate_btn.click(
-            generate,
-            inputs=[text_input, model_choice, voice_name, api_key],
-            outputs=[audio_output, error_output]
         )
-    return interface
-# ─── Deployment Options ───────────────────────────────────────────────────────
-gradio_app = create_gradio_interface()
-# For Hugging Face Spaces
-app = gr.mount_gradio_app(app, gradio_app, path="/")
-# For local development
 if __name__ == "__main__":
-    uvicorn.run(
-        "main:app",
-        host="0.0.0.0",
-        port=8000,
-        reload=True,
-        workers=2
-    )

+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
 from google import genai
 from google.genai import types
 import wave
+import os
+import uuid
 from typing import Optional
 from pydantic import BaseModel
+from pathlib import Path
+app = FastAPI(
+    title="Google GenAI TTS API",
+    description="API for text-to-speech conversion using Google GenAI",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url=None
+)
+# Configuration
+AUDIO_OUTPUT_DIR = "audio_output"
+Path(AUDIO_OUTPUT_DIR).mkdir(exist_ok=True)
+# Mount static files for Hugging Face Spaces
+app.mount("/static", StaticFiles(directory="static"), name="static")
 class TTSRequest(BaseModel):
     text: str
+    voice_name: Optional[str] = "Kore"
+    cheerful: Optional[bool] = True
+    sample_rate: Optional[int] = 24000
+    channels: Optional[int] = 1
+    sample_width: Optional[int] = 2
+def initialize_genai_client():
+    """Initialize the GenAI client with API key from environment"""
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        raise ValueError("GEMINI_API_KEY environment variable not set")
+    return genai.Client(api_key=api_key)
+def generate_wave_file(filename: str, pcm_data: bytes, channels: int, rate: int, sample_width: int):
+    """Generate a WAV file from PCM data"""
+    with wave.open(filename, "wb") as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(sample_width)
+        wf.setframerate(rate)
+        wf.writeframes(pcm_data)
+@app.post("/api/generate-tts/")
+async def generate_tts(request: TTSRequest):
+    """
+    Generate speech from text using Google GenAI TTS
+    Parameters:
+    - text: The text to convert to speech
+    - voice_name: Voice to use (default: 'Kore')
+    - cheerful: Whether to speak cheerfully (default: True)
+    - sample_rate: Audio sample rate (default: 24000)
+    - channels: Number of audio channels (default: 1)
+    - sample_width: Sample width in bytes (default: 2)
+    Returns:
+    - JSON with file URL or error message
+    """
+    try:
+        client = initialize_genai_client()
+        text_to_speak = f"Say cheerfully: {request.text}" if request.cheerful else request.text
+        response = client.models.generate_content(
+            model="gemini-2.5-flash-preview-tts",
+            contents=text_to_speak,
+            config=types.GenerateContentConfig(
+                response_modalities=["AUDIO"],
+                speech_config=types.SpeechConfig(
+                    voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                            voice_name=request.voice_name,
                         )
+                    )
                 ),
             )
+        )
+        if not response.candidates or not response.candidates[0].content.parts:
+            raise HTTPException(status_code=500, detail="No audio data received from GenAI")
+        audio_data = response.candidates[0].content.parts[0].inline_data.data
+        file_name = f"tts_{uuid.uuid4().hex}.wav"
+        file_path = os.path.join(AUDIO_OUTPUT_DIR, file_name)
+        generate_wave_file(
+            file_path,
+            audio_data,
+            channels=request.channels,
+            rate=request.sample_rate,
+            sample_width=request.sample_width
+        )
+        # For Hugging Face Spaces, we need to return the URL where the file can be accessed
+        file_url = f"/static/{file_name}"
+        os.rename(file_path, f"static/{file_name}")
+        return JSONResponse({
+            "status": "success",
+            "audio_url": file_url,
+            "filename": file_name
+        })
+    except Exception as e:
+        return JSONResponse(
+            {"status": "error", "message": str(e)},
+            status_code=500
         )
+@app.get("/")
+async def root():
+    return {"message": "Google GenAI TTS API is running"}
+# Error handler
+@app.exception_handler(Exception)
+async def generic_exception_handler(request: Request, exc: Exception):
+    return JSONResponse(
+        status_code=500,
+        content={"message": f"An error occurred: {str(exc)}"}
+    )
 if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8080)