Update app.py
app.py CHANGED
@@ -1,189 +1,111 @@
-import os
-import uuid
-import time
-from pathlib import Path
 import io
 import logging

-import
-from
-import
-import
-from fastapi import FastAPI, HTTPException, Body, BackgroundTasks
-from fastapi.responses import StreamingResponse # To send binary audio data
-from pydantic import BaseModel

 # --- Configuration ---
-# Choose a TTS model from the Hugging Face Hub
-MODEL_NAME = "espnet/kan-bayashi_ljspeech_vits" # Example model
-# MODEL_NAME = "suno/bark-small"
-
-# Directories
-BASE_DIR = Path(__file__).parent
-TEMP_AUDIO_DIR = BASE_DIR / "temp_audio" # For temporary storage before sending
-
-# Ensure temporary audio directory exists
-TEMP_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
-
-# Configure Logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # --- Pydantic Model for Request Body ---
 class TTSRequest(BaseModel):
-    text: str
-
-#
-
-start_load_time = time.time()
-tts_pipeline = None
-try:
-    # Use GPU if available
-    if torch.cuda.is_available():
-        device = "cuda"
-    # Check for MPS (Apple Silicon) support if not CUDA
-    elif torch.backends.mps.is_available():
-        device = "mps"
-    else:
-        device = "cpu"
-
-    logger.info(f"Using device: {device}")
-    tts_pipeline = pipeline("text-to-speech", model=MODEL_NAME, device=device)
-    logger.info(f"Model '{MODEL_NAME}' loaded successfully in {time.time() - start_load_time:.2f} seconds.")
-except Exception as e:
-    logger.error(f"FATAL: Could not load TTS model '{MODEL_NAME}'. Error: {e}", exc_info=True)
-    # The application can still run, but the /api/tts endpoint will fail until the model is loaded/fixed.

 # --- Initialize FastAPI App ---
 app = FastAPI(
-    title="
-    description=
-    version="1.0.0"
 )

-# --- Background Task for Cleanup ---
-def cleanup_temp_file(filepath: Path):
-    """Removes a file in the background."""
-    try:
-        if filepath.exists():
-            os.remove(filepath)
-            logger.info(f"Cleaned up temp file: {filepath.name}")
-    except OSError as e:
-        logger.error(f"Error deleting temp file {filepath.name}: {e}")
-
 # --- API Endpoint for Text-to-Speech ---
 @app.post(
     "/api/tts",
     tags=["TTS"],
-    summary="Generate Speech
-    description="""Send a JSON object with
-    Returns the generated speech as
     responses={
         200: {
-            "content": {"audio/
-            "description": "Successful response returning the
         },
-        400: {"description": "Bad Request (e.g., empty text)"},
-        500: {"description": "Internal Server Error (e.g.,
-        503: {"description": "Service Unavailable (e.g., model not loaded)"},
     },
 )
-async def
-    background_tasks: BackgroundTasks,
     tts_request: TTSRequest = Body(...)
 ):
     """
-    Receives text via POST request
     """
-    if tts_pipeline is None:
-        raise HTTPException(status_code=503, detail="TTS Model is not available or failed to load.")
-
     text = tts_request.text
     if not text or not text.strip():
         raise HTTPException(status_code=400, detail="Input text cannot be empty.")

-    logger.info(f"Received
     start_synth_time = time.time()

     try:
-        # --- Generate Audio ---
-
-
-
-        audio_data = output.get("audio")
-        sampling_rate = output.get("sampling_rate")
-
-        if audio_data is None or sampling_rate is None:
-            logger.error("TTS pipeline output missing 'audio' or 'sampling_rate'.")
-            raise ValueError("Invalid output from TTS pipeline.")
-
-        # Ensure NumPy array
-        if isinstance(audio_data, torch.Tensor):
-            # Ensure it's on CPU before converting to numpy
-            audio_data = audio_data.cpu().numpy()
-        if not isinstance(audio_data, np.ndarray):
-            logger.error(f"Unexpected audio data type: {type(audio_data)}")
-            raise TypeError(f"Expected audio data as NumPy array, got {type(audio_data)}")
-
-        # Normalize if float and outside [-1, 1] range (important for WAV)
-        if np.issubdtype(audio_data.dtype, np.floating):
-            max_val = np.max(np.abs(audio_data))
-            if max_val > 1.0:
-                audio_data = audio_data / max_val
-            # Convert to 16-bit integer format for standard WAV
-            audio_data = (audio_data * 32767).astype(np.int16)
-        elif not np.issubdtype(audio_data.dtype, np.integer):
-            logger.warning(f"Audio data is not float or int: {audio_data.dtype}. Attempting conversion to int16.")
-            # Attempt conversion if possible, might need adjustment based on model output
-            audio_data = audio_data.astype(np.int16)


         synthesis_time = time.time() - start_synth_time
-        logger.info(f"

-        # ---
-
-
-
-
-        logger.info(f"Temporary audio saved to: {filepath.name}")
-
-        # Schedule the cleanup task to run after the response is sent
-        background_tasks.add_task(cleanup_temp_file, filepath)
-
-        # Return the file directly as a streaming response
-        return FileResponse(
-            path=filepath,
-            media_type="audio/wav",
-            filename=filename # Suggests a filename to the client
         )

-
-
-        #
-
-
-
-
     except Exception as e:
-        logger.error(f"
-
-        if 'filepath' in locals() and filepath.exists():
-            logger.info(f"Cleaning up temp file due to error: {filepath.name}")
-            os.remove(filepath)
-        raise HTTPException(status_code=500, detail=f"Failed to process speech request. Error: {str(e)}")


 # --- Health Check Endpoint (Good Practice) ---
 @app.get("/health", tags=["System"], summary="Check API Health")
 async def health_check():
     """
-    Simple health check endpoint.
     """
-    if
-
-    #
-

 # --- Root Endpoint (Optional Information) ---
 @app.get("/", tags=["System"], summary="API Information")
@@ -192,41 +114,16 @@ async def read_root():
     Provides basic information about the API.
     """
     return {
-        "message": "Welcome to the
-        "
         "tts_endpoint": "/api/tts",
         "health_endpoint": "/health",
         "documentation": "/docs" # Link to FastAPI auto-generated docs
     }

-# --- Optional: Add cleanup for *old* files on startup (if using FileResponse) ---
-def cleanup_old_audio_files(max_age_seconds: int = 3600): # Clean files older than 1 hour
-    now = time.time()
-    count = 0
-    try:
-        for filename in os.listdir(TEMP_AUDIO_DIR):
-            filepath = TEMP_AUDIO_DIR / filename
-            if filepath.is_file() and filename.startswith("speech_") and filename.endswith(".wav"):
-                try:
-                    file_mod_time = os.path.getmtime(filepath)
-                    if (now - file_mod_time) > max_age_seconds:
-                        os.remove(filepath)
-                        logger.info(f"Startup cleanup: Removed old temp file {filename}")
-                        count += 1
-                except OSError as e:
-                    logger.warning(f"Startup cleanup: Error removing file {filename}: {e}")
-        if count > 0:
-            logger.info(f"Startup cleanup: Removed {count} old audio files.")
-    except Exception as e:
-        logger.error(f"Startup cleanup: Error during old file cleanup: {e}")
-
-# Run cleanup on startup
-cleanup_old_audio_files()
-
 # --- How to Run Locally (for testing) ---
 # if __name__ == "__main__":
 #     import uvicorn
-#
-#     TEMP_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
-#     cleanup_old_audio_files() # Run cleanup before starting server
-#     uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True) # Use reload=False for production testing
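Several removed lines inside the try block above (the bare '-' lines) lost their content in this rendering, including the actual synthesis call and the step that saved the WAV file referenced by filepath. As a rough, hypothetical sketch only, assuming the old code used the transformers text-to-speech pipeline output directly and a SciPy WAV writer (the writer choice and the synthesize_to_wav helper below are not from the original file), those elided steps would look roughly like this:

# Sketch only (not the original code): approximate reconstruction of the elided
# synthesis/save steps in the removed transformers-pipeline path.
# The SciPy WAV writer and the synthesize_to_wav() helper name are assumptions.
import uuid
from pathlib import Path

import numpy as np
import scipy.io.wavfile
from transformers import pipeline

TEMP_AUDIO_DIR = Path(__file__).parent / "temp_audio"
TEMP_AUDIO_DIR.mkdir(parents=True, exist_ok=True)

tts_pipeline = pipeline("text-to-speech", model="espnet/kan-bayashi_ljspeech_vits")

def synthesize_to_wav(text: str) -> Path:
    # The pipeline returns a dict with "audio" (float array) and "sampling_rate",
    # which is what the removed code read via output.get(...).
    output = tts_pipeline(text)
    audio = np.asarray(output["audio"]).squeeze()
    # Scale float audio in [-1, 1] to 16-bit PCM, as the removed code did.
    pcm = (audio * 32767).astype(np.int16)
    filepath = TEMP_AUDIO_DIR / f"speech_{uuid.uuid4().hex}.wav"
    scipy.io.wavfile.write(filepath, output["sampling_rate"], pcm)
    return filepath

The replacement version of app.py added by this commit follows; added lines are marked with '+'.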
 import io
 import logging
+import time

+from fastapi import FastAPI, HTTPException, Body, Response
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field # Field for adding validation/defaults
+from gtts import gTTS, gTTSError

 # --- Configuration ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # --- Pydantic Model for Request Body ---
 class TTSRequest(BaseModel):
+    text: str = Field(..., min_length=1, description="The text to be converted to speech.")
+    lang: str = Field("en", description="Language code for the speech (e.g., 'en', 'es', 'fr'). See gTTS documentation for supported languages.")
+    # Optional: Add tld if you need specific accents tied to Google domains
+    # tld: str = Field("com", description="Top-level domain for Google TTS endpoint (e.g., 'com', 'co.uk', 'com.au')")

 # --- Initialize FastAPI App ---
 app = FastAPI(
+    title="gTTS API Service",
+    description="A simple API service that uses gTTS (Google Text-to-Speech) to convert text into speech (MP3 audio).",
+    version="1.0.0",
 )

 # --- API Endpoint for Text-to-Speech ---
 @app.post(
     "/api/tts",
     tags=["TTS"],
+    summary="Generate Speech using gTTS",
+    description="""Send a JSON object with 'text' and optionally 'lang' fields.
+    Returns the generated speech as an MP3 audio stream.""",
     responses={
         200: {
+            "content": {"audio/mpeg": {}}, # MP3 content type
+            "description": "Successful response returning the MP3 audio stream.",
         },
+        400: {"description": "Bad Request (e.g., empty text, invalid language)"},
+        500: {"description": "Internal Server Error (e.g., gTTS failed)"},
     },
 )
+async def generate_speech_gtts_api(
     tts_request: TTSRequest = Body(...)
 ):
     """
+    Receives text and language via POST request, uses gTTS to generate
+    speech, and returns the MP3 audio directly as a stream.
     """
     text = tts_request.text
+    lang = tts_request.lang
+    # tld = tts_request.tld # Uncomment if using tld
+
     if not text or not text.strip():
+        # The pydantic model validation (min_length=1) should catch this,
+        # but a belt-and-suspenders approach is fine.
         raise HTTPException(status_code=400, detail="Input text cannot be empty.")

+    logger.info(f"Received gTTS request: lang='{lang}', text='{text[:50]}...'")
     start_synth_time = time.time()

     try:
+        # --- Generate Audio using gTTS ---
+        # Create gTTS object
+        tts = gTTS(text=text, lang=lang, slow=False) # Add tld=tld if using

+        # --- Prepare Audio for Streaming ---
+        # Use an in-memory buffer (avoids temporary files)
+        mp3_fp = io.BytesIO()
+        tts.write_to_fp(mp3_fp)
+        mp3_fp.seek(0) # Rewind the buffer to the beginning for reading

         synthesis_time = time.time() - start_synth_time
+        logger.info(f"gTTS audio generated in {synthesis_time:.2f} seconds.")

+        # --- Return Streaming Response ---
+        return StreamingResponse(
+            mp3_fp,
+            media_type="audio/mpeg", # Standard MIME type for MP3
+            headers={'Content-Disposition': 'attachment; filename="speech.mp3"'} # Suggest filename
         )

+    except gTTSError as e:
+        logger.error(f"gTTS Error: {e}", exc_info=True)
+        # Check for common errors like invalid language
+        if "Language not supported" in str(e):
+            raise HTTPException(status_code=400, detail=f"Language '{lang}' not supported by gTTS. Error: {e}")
+        else:
+            raise HTTPException(status_code=500, detail=f"gTTS failed to generate speech. Error: {e}")
     except Exception as e:
+        logger.error(f"An unexpected error occurred during speech generation: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"An unexpected error occurred. Error: {str(e)}")


 # --- Health Check Endpoint (Good Practice) ---
 @app.get("/health", tags=["System"], summary="Check API Health")
 async def health_check():
     """
+    Simple health check endpoint. Returns status "ok" if the service is running.
     """
+    # Could add a quick gTTS test here if needed, but it might slow down the health check
+    # try:
+    #     gTTS(text='test', lang='en').save('test.mp3') # Dummy generation
+    #     os.remove('test.mp3')
+    # except Exception as e:
+    #     return {"status": "unhealthy", "reason": f"gTTS basic test failed: {e}"}
+    return {"status": "ok"}

 # --- Root Endpoint (Optional Information) ---
 @app.get("/", tags=["System"], summary="API Information")
 async def read_root():
     """
     Provides basic information about the API.
     """
     return {
+        "message": "Welcome to the gTTS API Service!",
+        "tts_engine": "gTTS (Google Text-to-Speech)",
         "tts_endpoint": "/api/tts",
         "health_endpoint": "/health",
+        "expected_request_body": {"text": "string", "lang": "string (optional, default 'en')"},
+        "response_content_type": "audio/mpeg",
         "documentation": "/docs" # Link to FastAPI auto-generated docs
     }

 # --- How to Run Locally (for testing) ---
 # if __name__ == "__main__":
 #     import uvicorn
+#     uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True)
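With the new version running, the endpoint can be exercised with a short client script. This is a sketch that assumes the app is served locally via the commented-out uvicorn command above (http://127.0.0.1:8000) and that the requests package is installed; adjust the URL for a deployed Space.

# Sketch: POST text to /api/tts and save the returned MP3.
# Assumes the service is reachable at http://127.0.0.1:8000 and `requests` is installed.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/api/tts",
    json={"text": "Hello from the gTTS API service.", "lang": "en"},
    timeout=60,
)
resp.raise_for_status()

# The endpoint streams MP3 data (audio/mpeg) and suggests the filename speech.mp3.
with open("speech.mp3", "wb") as f:
    f.write(resp.content)

A 400 response indicates empty text or an unsupported language code; a 500 response indicates that gTTS failed to generate speech.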