ndc8 committed
Commit 8d9c495 · 1 Parent(s): 3239c69
Files changed (1)
  1. backend_service.py +52 -93
backend_service.py CHANGED
@@ -15,7 +15,6 @@ warnings.filterwarnings("ignore", message=".*rope_scaling.*")
 os.environ.setdefault("HF_HOME", "/tmp/.cache/huggingface")
 # Suppress advisory warnings from transformers (including deprecation warnings)
 os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
-# Define Hugging Face auth token from environment
 hf_token = os.environ.get("HF_TOKEN")
 import asyncio
 import logging
@@ -28,10 +27,11 @@ from fastapi import FastAPI, HTTPException, Depends, Request
 from fastapi.responses import StreamingResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field, field_validator
-from huggingface_hub import InferenceClient
+
 import uvicorn
 import requests
 from PIL import Image
+from transformers import AutoTokenizer, AutoModelForCausalLM

 # Transformers imports (now required)
 try:
@@ -128,12 +128,13 @@ class CompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(default=512, ge=1, le=2048)
     temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0)

+
 # Global variables for model management
-inference_client: Optional[InferenceClient] = None
-image_text_pipeline = None  # type: ignore
 current_model = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
 vision_model = "Salesforce/blip-image-captioning-base"  # Working model for image captioning
 tokenizer = None
+model = None
+image_text_pipeline = None  # type: ignore

 # Image processing utilities
 async def download_image(url: str) -> Image.Image:
@@ -173,23 +174,22 @@ def has_images(messages: List[ChatMessage]) -> bool:
             return True
     return False

+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan manager for startup and shutdown events"""
-    global inference_client, tokenizer, image_text_pipeline
-
-    # Startup
+    global tokenizer, model, image_text_pipeline
     logger.info("🚀 Starting AI Backend Service...")
     try:
-        # Initialize HuggingFace Inference Client for text generation
-        inference_client = InferenceClient(model=current_model)
-        logger.info(f"✅ Initialized inference client with model: {current_model}")
-
-        # Initialize image-text-to-text pipeline
+        # Load local tokenizer and model
+        tokenizer = AutoTokenizer.from_pretrained(current_model)
+        model = AutoModelForCausalLM.from_pretrained(current_model)
+        logger.info(f"✅ Loaded local model and tokenizer: {current_model}")
+        # Optionally, load image pipeline as before
         if transformers_available and pipeline:
             try:
                 logger.info(f"🖼️ Initializing image captioning pipeline with model: {vision_model}")
-                image_text_pipeline = pipeline("image-to-text", model=vision_model)  # Use image-to-text task
+                image_text_pipeline = pipeline("image-to-text", model=vision_model)
                 logger.info("✅ Image captioning pipeline loaded successfully")
             except Exception as e:
                 logger.warning(f"⚠️ Could not load image captioning pipeline: {e}")
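Note: `current_model` points at a GGUF-only repository (`unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF`). A bare `AutoModelForCausalLM.from_pretrained(current_model)` normally expects a standard `config.json`/safetensors layout; transformers loads GGUF checkpoints only when a `gguf_file` argument is supplied, and only for architectures its GGUF loader supports. A minimal sketch of that variant follows; the `.gguf` filename is a placeholder, not a file verified to exist in the repo.

```python
# Sketch only: loading a GGUF checkpoint with transformers.
# The gguf filename is an assumed placeholder; it must match a real file in the repo.
from transformers import AutoTokenizer, AutoModelForCausalLM

repo_id = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
gguf_file = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"  # placeholder filename

tokenizer = AutoTokenizer.from_pretrained(repo_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(repo_id, gguf_file=gguf_file)
```

Pointing `current_model` at the original non-GGUF checkpoint is the other obvious way to keep the plain `from_pretrained` call working.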
@@ -197,37 +197,13 @@ async def lifespan(app: FastAPI):
         else:
             logger.warning("⚠️ Transformers not available, image processing disabled")
             image_text_pipeline = None
-
-        # Initialize tokenizer for better text handling
-        if transformers_available and AutoTokenizer:
-            try:
-                # Load tokenizer, using auth token if provided
-                if hf_token:
-                    tokenizer = AutoTokenizer.from_pretrained(
-                        current_model,
-                        token=hf_token
-                    )  # type: ignore
-                else:
-                    tokenizer = AutoTokenizer.from_pretrained(
-                        current_model
-                    )  # type: ignore
-                logger.info("✅ Tokenizer loaded successfully")
-            except Exception as e:
-                logger.warning(f"⚠️ Could not load tokenizer: {e}")
-                tokenizer = None
-        else:
-            logger.info("⚠️ Tokenizer initialization skipped")
-
     except Exception as e:
-        logger.error(f"❌ Failed to initialize inference client: {e}")
+        logger.error(f"❌ Failed to initialize local model: {e}")
         raise RuntimeError(f"Service initialization failed: {e}")
-
     yield
-
-    # Shutdown
     logger.info("🔄 Shutting down AI Backend Service...")
-    inference_client = None
     tokenizer = None
+    model = None
     image_text_pipeline = None

 # Initialize FastAPI app
@@ -247,11 +223,10 @@ app.add_middleware(
     allow_headers=["*"],
 )

-def get_inference_client() -> InferenceClient:
-    """Dependency to get the inference client"""
-    if inference_client is None:
-        raise HTTPException(status_code=503, detail="Service not ready - inference client not initialized")
-    return inference_client
+
+def ensure_model_ready():
+    if tokenizer is None or model is None:
+        raise HTTPException(status_code=503, detail="Service not ready - model not initialized")

 def convert_messages_to_prompt(messages: List[ChatMessage]) -> str:
     """Convert OpenAI messages format to a single prompt string"""
@@ -341,36 +316,30 @@ async def generate_multimodal_response(
         logger.error(f"Error in multimodal generation: {e}")
         return f"I'm having trouble processing the image. Error: {str(e)}"

-def generate_response_safe(client: InferenceClient, prompt: str, max_tokens: int, temperature: float, top_p: float) -> str:
-    """Safely generate response from the model with fallback methods"""
+
+def generate_response_local(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
+    """Generate response using local model and tokenizer with chat template."""
+    ensure_model_ready()
     try:
-        # Method 1: Try text_generation with new parameters
-        response_text = client.text_generation(
-            prompt=prompt,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            return_full_text=False,
-            stop=["Human:", "System:"]  # Use stop instead of stop_sequences
+        # Convert messages to OpenAI format for chat template
+        chat_messages = []
+        for m in messages:
+            chat_messages.append({"role": m.role, "content": m.content if isinstance(m.content, str) else extract_text_and_images(m.content)[0]})
+        inputs = tokenizer.apply_chat_template(
+            chat_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
         )
-        return response_text.strip() if response_text else "I apologize, but I couldn't generate a response."
-
+        inputs = inputs.to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_p=top_p)
+        # Only decode the newly generated tokens
+        generated = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+        return generated.strip()
     except Exception as e:
-        logger.warning(f"text_generation failed: {e}")
-
-        # Method 2: Try with minimal parameters
-        try:
-            response_text = client.text_generation(
-                prompt=prompt,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                return_full_text=False
-            )
-            return response_text.strip() if response_text else "I apologize, but I couldn't generate a response."
-
-        except Exception as e2:
-            logger.error(f"All generation methods failed: {e2}")
-            return "I apologize, but I'm having trouble generating a response right now. Please try again."
+        logger.error(f"Local generation failed: {e}")
+        return "I apologize, but I'm having trouble generating a response right now. Please try again."

 async def generate_streaming_response(
     client: InferenceClient,
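Note: `generate_streaming_response` (left untouched below this hunk) still annotates its first parameter as `InferenceClient`, whose import this commit removes, and the streaming branch is dropped from the chat endpoint in a later hunk. If streaming is wanted on the local path, one option is transformers' `TextIteratorStreamer`. A sketch under that assumption, reusing the global `model`/`tokenizer` and inputs prepared the same way as in `generate_response_local`; this is not part of this commit.

```python
# Sketch: token streaming from the local model via TextIteratorStreamer.
# `inputs` is assumed to be the tensor dict produced by tokenizer.apply_chat_template(...).
from threading import Thread
from transformers import TextIteratorStreamer

def stream_response_local(inputs, max_tokens: int = 512):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, max_new_tokens=max_tokens, streamer=streamer)
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    for chunk in streamer:  # yields decoded text pieces as they are generated
        yield chunk
```

A generator like this could back a `StreamingResponse` in the endpoint, mirroring the branch removed below.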
@@ -491,10 +460,10 @@ async def list_models():

 # ...existing code...

+
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
 async def create_chat_completion(
-    request: ChatCompletionRequest,
-    client: InferenceClient = Depends(get_inference_client)
+    request: ChatCompletionRequest
 ) -> ChatCompletionResponse:
     """Create a chat completion (OpenAI-compatible) with multimodal support."""
     try:
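The endpoint no longer receives a client via `Depends(get_inference_client)`; readiness is now checked by `ensure_model_ready()` inside the generation path. If the dependency-injection style is preferred, the same guard can stay a FastAPI dependency. A sketch with an illustrative name, not part of this commit:

```python
# Sketch: the readiness guard expressed as a route dependency instead of an explicit call.
# `require_model` is an illustrative name.
from fastapi import Depends

def require_model() -> None:
    if tokenizer is None or model is None:
        raise HTTPException(status_code=503, detail="Service not ready - model not initialized")

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse,
          dependencies=[Depends(require_model)])
async def create_chat_completion(request: ChatCompletionRequest) -> ChatCompletionResponse:
    ...
```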
@@ -506,22 +475,10 @@ async def create_chat_completion(
                 raise HTTPException(status_code=503, detail="Image processing not available")
             response_text = await generate_multimodal_response(request.messages, request)
         else:
-            prompt = convert_messages_to_prompt(request.messages)
-            logger.info(f"Generated prompt: {prompt[:200]}...")
-            if request.stream:
-                return StreamingResponse(
-                    generate_streaming_response(client, prompt, request),
-                    media_type="text/plain",
-                    headers={
-                        "Cache-Control": "no-cache",
-                        "Connection": "keep-alive",
-                        "Content-Type": "text/plain; charset=utf-8"
-                    }
-                )  # type: ignore
+            logger.info(f"Generating local response for messages: {request.messages}")
             response_text = await asyncio.to_thread(
-                generate_response_safe,
-                client,
-                prompt,
+                generate_response_local,
+                request.messages,
                 request.max_tokens or 512,
                 request.temperature or 0.7,
                 request.top_p or 0.95
@@ -542,19 +499,21 @@
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")


+
 @app.post("/v1/completions")
 async def create_completion(
-    request: CompletionRequest,
-    client: InferenceClient = Depends(get_inference_client)
+    request: CompletionRequest
 ) -> Dict[str, Any]:
     """Create a text completion (OpenAI-compatible)"""
     try:
         if not request.prompt:
             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+        ensure_model_ready()
+        # Use the prompt as a single user message
+        messages = [ChatMessage(role="user", content=request.prompt)]
         response_text = await asyncio.to_thread(
-            generate_response_safe,
-            client,
-            request.prompt,
+            generate_response_local,
+            messages,
             request.max_tokens or 512,
             request.temperature or 0.7,
             0.95
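For reference, the surviving endpoints stay OpenAI-shaped, so the service can be exercised with any HTTP client once it is running. A sketch follows; the base URL assumes uvicorn's default port, and the request schema may accept more fields than those visible in this diff (for example a `model` field).

```python
# Sketch: calling the OpenAI-compatible endpoints; base URL is an assumption.
import requests

base = "http://localhost:8000"

chat = requests.post(f"{base}/v1/chat/completions", json={
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 128,
    "temperature": 0.7,
})
print(chat.json())

completion = requests.post(f"{base}/v1/completions", json={
    "prompt": "Write one sentence about FastAPI.",
    "max_tokens": 64,
})
print(completion.json())
```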