Update app.py
app.py CHANGED
@@ -94,31 +94,45 @@

 # return Response("No audio generated", status_code=400)

-from fastapi import FastAPI, Response, HTTPException, Request
-from fastapi.responses import JSONResponse
-from fastapi.staticfiles import StaticFiles
-from kokoro import KPipeline
 import os
-import …
-import torch
-from huggingface_hub import InferenceClient
-from pydantic import BaseModel
+import uuid
 import base64
 import logging
+from fastapi import FastAPI, HTTPException, Response, Request
+from fastapi.responses import JSONResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
 from typing import Optional, ClassVar, List
-import …
+from huggingface_hub import InferenceClient
+import numpy as np
+import torch
+from kokoro import KPipeline  # Assuming you have this pipeline for audio generation

 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+# Create FastAPI app
+app = FastAPI(
+    title="Text-to-Speech API with Vision Support",
+    description="This API uses meta-llama/Llama-3.2-11B-Vision-Instruct, which requires an image input.",
+    version="1.0.0"
+)
+
+# Mount a static directory for serving saved images
+STATIC_DIR = "static_images"
+if not os.path.exists(STATIC_DIR):
+    os.makedirs(STATIC_DIR)
+app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
+
+# Pydantic model for request
 class TextImageRequest(BaseModel):
     text: Optional[str] = None
     image_base64: Optional[str] = None
-    voice: str = "af_heart"  # Default voice
+    voice: str = "af_heart"  # Default voice
     speed: float = 1.0

-    # …
+    # Use ClassVar so that Pydantic doesn't treat this as a model field.
     AVAILABLE_VOICES: ClassVar[List[str]] = ["af_heart"]

     def validate_voice(self):
@@ -126,6 +140,7 @@ class TextImageRequest(BaseModel):
             return "af_heart"
         return self.voice

+# (Optional) Pydantic models for responses
 class AudioResponse(BaseModel):
     status: str
     message: str
@@ -134,107 +149,72 @@ class ErrorResponse(BaseModel):
     error: str
     detail: Optional[str] = None

-# …
-… [4 removed lines unrecoverable in extraction]
-)
-… [9 removed lines unrecoverable in extraction]
+# Function to call the LLM model following the reference code exactly
+def llm_chat_response(text: str, image_base64: str) -> str:
+    HF_TOKEN = os.getenv("HF_TOKEN")
+    logger.info("Checking HF_TOKEN...")
+    if not HF_TOKEN:
+        logger.error("HF_TOKEN not configured")
+        raise HTTPException(status_code=500, detail="HF_TOKEN not configured")
+
+    logger.info("Initializing InferenceClient...")
+    client = InferenceClient(
+        provider="hf-inference",
+        api_key=HF_TOKEN
+    )
+
+    # Save the base64-encoded image locally so it is accessible via a URL
+    filename = f"{uuid.uuid4()}.jpg"
+    image_path = os.path.join(STATIC_DIR, filename)
     try:
-… [22 removed lines unrecoverable in extraction]
-            with open(image_path, "wb") as f:
-                f.write(image_data)
-            # Construct image URL (assumes BASE_URL environment variable or defaults to localhost)
-            base_url = os.getenv("BASE_URL", "http://localhost:8000")
-            image_url = f"{base_url}/static/{filename}"
-            prompt = text if text else "Describe this image in one sentence."
-            # Construct message exactly as in the reference
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt},
-                        {"type": "image_url", "image_url": {"url": image_url}}
-                    ]
-                }
-            ]
-        else:
-            logger.info("Processing text-only request")
-            messages = [
-                {
-                    "role": "user",
-                    "content": text + " Describe in one line only."
-                }
+        image_data = base64.b64decode(image_base64)
+    except Exception as e:
+        logger.error(f"Error decoding image: {str(e)}")
+        raise HTTPException(status_code=400, detail="Invalid base64 image data")
+
+    with open(image_path, "wb") as f:
+        f.write(image_data)
+
+    # Construct the public URL for the saved image.
+    # BASE_URL should be set to your public URL if not running locally.
+    base_url = os.getenv("BASE_URL", "http://localhost:8000")
+    image_url = f"{base_url}/static/{filename}"
+
+    # Build the message exactly as in the reference code.
+    # This model requires a list with two items: one for text and one for the image.
+    prompt = text if text else "Describe this image in one sentence."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": image_url}}
             ]
-… [4 removed lines unrecoverable in extraction]
+        }
+    ]
+    logger.info(f"Message structure: {messages}")
+
+    try:
         completion = client.chat.completions.create(
             model="meta-llama/Llama-3.2-11B-Vision-Instruct",
             messages=messages,
             max_tokens=500
         )
-
-        logger.info(" …
-… [2 removed lines unrecoverable in extraction]
-        try:
-            response = completion.choices[0].message.content
-            logger.info(f"Extracted response content: {response}")
-            return response
-        except Exception as e:
-            logger.error(f"Error extracting message content: {str(e)}")
-            try:
-                if hasattr(completion.choices[0], "message") and hasattr(completion.choices[0].message, "content"):
-                    return completion.choices[0].message.content
-                return completion.choices[0]["message"]["content"]
-            except Exception as e2:
-                logger.error(f"All extraction methods failed: {str(e2)}")
-                return "I couldn't process that input. Please try again with a different query."
-
+        response = completion.choices[0].message.content
+        logger.info(f"Extracted response: {response}")
+        return response
     except Exception as e:
-        logger.error(f"Error …
+        logger.error(f"Error during model inference: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

-# Initialize …
+# Initialize audio generation pipeline (your audio conversion pipeline)
 try:
     logger.info("Initializing KPipeline...")
     pipeline = KPipeline(lang_code='a')
     logger.info("KPipeline initialized successfully")
 except Exception as e:
     logger.error(f"Failed to initialize KPipeline: {str(e)}")
-    # The …
+    # The API can still run, but audio generation will fail.

 @app.post("/generate", responses={
     200: {"content": {"application/octet-stream": {}}},
@@ -243,95 +223,69 @@ except Exception as e:
 })
 async def generate_audio(request: TextImageRequest):
     """
-    Generate audio from …
-
-    - If text is provided, it is used as input.
-    - If an image is provided (base64), it is saved and a URL is generated for processing.
-    - The LLM response is then converted to speech.
+    Generate audio from a multimodal (text+image) input.
+    This model does not support text-only inputs.
     """
+    logger.info("Received generation request")
+    # Ensure an image is provided because the model is multimodal.
+    if not request.image_base64:
+        raise HTTPException(status_code=400, detail="This model requires an image input.")
+
+    # Get the text prompt. If none is provided, use a default.
+    user_text = request.text if request.text else "Describe this image in one sentence."
+
+    # Get the LLM's response
+    logger.info("Calling the LLM model")
+    text_reply = llm_chat_response(user_text, request.image_base64)
+    logger.info(f"LLM response: {text_reply}")
+
+    # Validate voice parameter (if needed for audio generation)
+    validated_voice = request.validate_voice()
+    if validated_voice != request.voice:
+        logger.warning(f"Voice '{request.voice}' not available; using '{validated_voice}' instead")
+
+    # Convert the text reply to audio using your audio pipeline
+    logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
     try:
-… [15 removed lines unrecoverable in extraction]
-        validated_voice = request.validate_voice()
-        if validated_voice != request.voice:
-            logger.warning(f"Requested voice '{request.voice}' not available, using '{validated_voice}' instead")
-
-        logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
-        try:
-            generator = pipeline(
-                text_reply,
-                voice=validated_voice,
-                speed=request.speed,
-                split_pattern=r'\n+'
-            )
-
-            for i, (gs, ps, audio) in enumerate(generator):
-                logger.info(f"Audio generated successfully: segment {i}")
-                # Convert PyTorch tensor to NumPy array
-                audio_numpy = audio.cpu().numpy()
-                # Clip values to range [-1, 1] and convert to 16-bit PCM
-                audio_numpy = np.clip(audio_numpy, -1, 1)
-                pcm_data = (audio_numpy * 32767).astype(np.int16)
-                raw_audio = pcm_data.tobytes()
-
-                return Response(
-                    content=raw_audio,
-                    media_type="application/octet-stream",
-                    headers={
-                        "Content-Disposition": 'attachment; filename="output.pcm"',
-                        "X-Sample-Rate": "24000",
-                        "X-Bits-Per-Sample": "16",
-                        "X-Endianness": "little"
-                    }
-                )
-
-            logger.error("No audio segments generated")
-            return JSONResponse(
-                status_code=400,
-                content={"error": "No audio generated", "detail": "The pipeline did not produce any audio"}
-            )
+        # Generate audio segments (assumes pipeline yields segments)
+        generator = pipeline(
+            text_reply,
+            voice=validated_voice,
+            speed=request.speed,
+            split_pattern=r'\n+'
+        )
+        for i, (gs, ps, audio) in enumerate(generator):
+            logger.info(f"Audio generated, segment {i}")
+            # Convert audio tensor to 16-bit PCM bytes
+            audio_numpy = audio.cpu().numpy()
+            audio_numpy = np.clip(audio_numpy, -1, 1)
+            pcm_data = (audio_numpy * 32767).astype(np.int16)
+            raw_audio = pcm_data.tobytes()

-… [5 removed lines unrecoverable in extraction]
+            return Response(
+                content=raw_audio,
+                media_type="application/octet-stream",
+                headers={
+                    "Content-Disposition": 'attachment; filename="output.pcm"',
+                    "X-Sample-Rate": "24000",
+                    "X-Bits-Per-Sample": "16",
+                    "X-Endianness": "little"
+                }
             )
-
+        raise HTTPException(status_code=400, detail="No audio segments generated.")
     except Exception as e:
-        logger.error(f" …
-        return JSONResponse(
-            status_code=500,
-            content={"error": "Internal server error", "detail": str(e)}
-        )
+        logger.error(f"Error generating audio: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))

 @app.get("/")
 async def root():
-    return {"message": "Welcome …
+    return {"message": "Welcome! Use POST /generate with text and image_base64."}

 @app.exception_handler(404)
 async def not_found_handler(request: Request, exc):
-    return JSONResponse(
-        status_code=404,
-        content={"error": "Endpoint not found. Please use POST /generate for queries."}
-    )
+    return JSONResponse(status_code=404, content={"error": "Endpoint not found."})

 @app.exception_handler(405)
 async def method_not_allowed_handler(request: Request, exc):
-    return JSONResponse(
-        status_code=405,
-        content={"error": "Method not allowed. Please check the API documentation."}
-    )
+    return JSONResponse(status_code=405, content={"error": "Method not allowed."})
+
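A deployment note on the new image flow: llm_chat_response never uploads the image bytes to the model. It writes them under static_images/ and sends a URL built from BASE_URL, so the hf-inference provider must be able to fetch that URL. The http://localhost:8000 fallback only works when the model endpoint can reach your machine; on a hosted Space, BASE_URL should point at the Space's public address before the app builds image URLs. A minimal sketch, with a placeholder address:

import os

# Hypothetical public address of this deployment; the inference provider
# must be able to download {BASE_URL}/static/<uuid>.jpg from it.
os.environ.setdefault("BASE_URL", "https://your-space.hf.space")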
|
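For reference, a minimal client sketch for the new /generate contract. The host, image file, and prompt below are placeholders, and requests is an assumed third-party dependency:

import base64
import requests

# Encode the input image as expected by TextImageRequest.image_base64.
with open("photo.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:8000/generate",
    json={
        "text": "What is in this picture?",
        "image_base64": image_b64,
        "voice": "af_heart",
        "speed": 1.0,
    },
)
resp.raise_for_status()

# The body is raw PCM, not a WAV container; the X-* headers describe it.
with open("output.pcm", "wb") as f:
    f.write(resp.content)
print(resp.headers.get("X-Sample-Rate"), "Hz,",
      resp.headers.get("X-Bits-Per-Sample"), "bit")

Note that generate_audio returns from inside the segment loop, so a multi-segment reply still yields only the first segment's audio.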
|
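Because the endpoint ships headerless PCM, playback tools need a container. A sketch using only the standard library, assuming the advertised format (24000 Hz, 16-bit little-endian) plus single-channel output, which the handler does not state explicitly:

import wave

with open("output.pcm", "rb") as f:
    pcm = f.read()

with wave.open("output.wav", "wb") as w:
    w.setnchannels(1)      # assumed mono; the response headers omit channel count
    w.setsampwidth(2)      # 16 bits per sample, per X-Bits-Per-Sample
    w.setframerate(24000)  # per X-Sample-Rate
    w.writeframes(pcm)     # wave writes little-endian PCM, matching X-Endianness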