Spaces:

Yadav122
/

llm-ai-agent

Sleeping

App Files Files Community

Yadav122 committed 8 days ago

Commit

25f8aca

verified ·

1 Parent(s): 8aea355

Fix: Use simplified app for better compatibility

Browse files

Files changed (1) hide show

app.py +108 -145

app.py CHANGED Viewed

@@ -1,16 +1,12 @@
 import os
-import secrets
-import hashlib
-from typing import Optional, Dict, Any
-from datetime import datetime, timedelta
 import logging
 from fastapi import FastAPI, HTTPException, Depends, Security, status
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import uvicorn
 # Configure logging
@@ -21,15 +17,13 @@ logger = logging.getLogger(__name__)
 app = FastAPI(
     title="LLM AI Agent API",
     description="Secure AI Agent API with Local LLM deployment",
-    version="1.0.0",
-    docs_url="/docs",
-    redoc_url="/redoc"
 )
-# CORS middleware for cross-origin requests
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Configure this for production
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
@@ -39,135 +33,101 @@ app.add_middleware(
 security = HTTPBearer()
 # Configuration
-class Config:
-    # API Keys - In production, use environment variables
-    API_KEYS = {
-        os.getenv("API_KEY_1", "your-secure-api-key-1"): "user1",
-        os.getenv("API_KEY_2", "your-secure-api-key-2"): "user2",
-        # Add more API keys as needed
-    }
-    # Model configuration
-    MODEL_NAME = os.getenv("MODEL_NAME", "microsoft/DialoGPT-medium")  # Lightweight model for free tier
-    MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
-    TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
-    TOP_P = float(os.getenv("TOP_P", "0.9"))
-    # Rate limiting (requests per minute per API key)
-    RATE_LIMIT = int(os.getenv("RATE_LIMIT", "10"))
-# Global variables for model and tokenizer
 model = None
 tokenizer = None
-text_generator = None
 # Request/Response models
 class ChatRequest(BaseModel):
-    message: str = Field(..., min_length=1, max_length=1000, description="Input message for the AI agent")
-    max_length: Optional[int] = Field(None, ge=10, le=2048, description="Maximum response length")
-    temperature: Optional[float] = Field(None, ge=0.1, le=2.0, description="Response creativity (0.1-2.0)")
-    system_prompt: Optional[str] = Field(None, max_length=500, description="Optional system prompt")
 class ChatResponse(BaseModel):
     response: str
     model_used: str
     timestamp: str
-    tokens_used: int
     processing_time: float
 class HealthResponse(BaseModel):
     status: str
     model_loaded: bool
     timestamp: str
-    version: str
-# Rate limiting storage (in production, use Redis)
-request_counts: Dict[str, Dict[str, int]] = {}
 def verify_api_key(credentials: HTTPAuthorizationCredentials = Security(security)) -> str:
     """Verify API key authentication"""
     api_key = credentials.credentials
-    if api_key not in Config.API_KEYS:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Invalid API key",
-            headers={"WWW-Authenticate": "Bearer"},
         )
-    return Config.API_KEYS[api_key]
-def check_rate_limit(api_key: str) -> bool:
-    """Simple rate limiting implementation"""
-    current_minute = datetime.now().strftime("%Y-%m-%d-%H-%M")
-    if api_key not in request_counts:
-        request_counts[api_key] = {}
-    if current_minute not in request_counts[api_key]:
-        request_counts[api_key][current_minute] = 0
-    if request_counts[api_key][current_minute] >= Config.RATE_LIMIT:
-        return False
-    request_counts[api_key][current_minute] += 1
-    return True
 @app.on_event("startup")
 async def load_model():
     """Load the LLM model on startup"""
-    global model, tokenizer, text_generator
     try:
-        logger.info(f"Loading model: {Config.MODEL_NAME}")
-        # Load tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
-        # Add padding token if it doesn't exist
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        # Load model with optimizations for free tier
-        model = AutoModelForCausalLM.from_pretrained(
-            Config.MODEL_NAME,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-            low_cpu_mem_usage=True
-        )
-        # Create text generation pipeline
-        text_generator = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            device=0 if torch.cuda.is_available() else -1
-        )
-        logger.info("Model loaded successfully!")
     except Exception as e:
-        logger.error(f"Error loading model: {str(e)}")
-        raise e
 @app.get("/", response_model=HealthResponse)
 async def root():
     """Health check endpoint"""
     return HealthResponse(
         status="healthy",
-        model_loaded=model is not None,
-        timestamp=datetime.now().isoformat(),
-        version="1.0.0"
     )
 @app.get("/health", response_model=HealthResponse)
 async def health_check():
     """Detailed health check"""
     return HealthResponse(
-        status="healthy" if model is not None else "model_not_loaded",
-        model_loaded=model is not None,
-        timestamp=datetime.now().isoformat(),
-        version="1.0.0"
     )
 @app.post("/chat", response_model=ChatResponse)
@@ -178,58 +138,62 @@ async def chat(
     """Main chat endpoint for AI agent interaction"""
     start_time = datetime.now()
-    # Check rate limiting
-    api_key = None  # In a real implementation, you'd extract this from the token
-    # if not check_rate_limit(api_key):
-    #     raise HTTPException(
-    #         status_code=status.HTTP_429_TOO_MANY_REQUESTS,
-    #         detail="Rate limit exceeded. Please try again later."
-    #     )
-    if model is None or tokenizer is None:
-        raise HTTPException(
-            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-            detail="Model not loaded. Please try again later."
-        )
     try:
-        # Prepare input
-        input_text = request.message
-        if request.system_prompt:
-            input_text = f"System: {request.system_prompt}\nUser: {request.message}\nAssistant:"
-        # Generate response
-        max_length = request.max_length or Config.MAX_LENGTH
-        temperature = request.temperature or Config.TEMPERATURE
-        # Generate text
-        generated = text_generator(
-            input_text,
-            max_length=max_length,
-            temperature=temperature,
-            top_p=Config.TOP_P,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,
-            num_return_sequences=1,
-            truncation=True
-        )
-        # Extract response
-        response_text = generated[0]['generated_text']
-        if input_text in response_text:
-            response_text = response_text.replace(input_text, "").strip()
         # Calculate processing time
         processing_time = (datetime.now() - start_time).total_seconds()
-        # Count tokens (approximate)
-        tokens_used = len(tokenizer.encode(response_text))
         return ChatResponse(
             response=response_text,
-            model_used=Config.MODEL_NAME,
             timestamp=datetime.now().isoformat(),
-            tokens_used=tokens_used,
             processing_time=processing_time
         )
@@ -244,18 +208,17 @@ async def chat(
 async def get_model_info(user: str = Depends(verify_api_key)):
     """Get information about the loaded model"""
     return {
-        "model_name": Config.MODEL_NAME,
-        "model_loaded": model is not None,
-        "max_length": Config.MAX_LENGTH,
-        "temperature": Config.TEMPERATURE,
-        "device": "cuda" if torch.cuda.is_available() else "cpu"
     }
 if __name__ == "__main__":
-    # For local development
     uvicorn.run(
-        "app:app",
         host="0.0.0.0",
-        port=int(os.getenv("PORT", "7860")),  # Hugging Face Spaces uses port 7860
         reload=False
     )

 import os
 import logging
+from typing import Optional
+from datetime import datetime
 from fastapi import FastAPI, HTTPException, Depends, Security, status
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 import uvicorn
 # Configure logging
 app = FastAPI(
     title="LLM AI Agent API",
     description="Secure AI Agent API with Local LLM deployment",
+    version="1.0.0"
 )
+# CORS middleware
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 security = HTTPBearer()
 # Configuration
+API_KEYS = {
+    os.getenv("API_KEY_1", "27Eud5J73j6SqPQAT2ioV-CtiCg-p0WNqq6I4U0Ig6E"): "user1",
+    os.getenv("API_KEY_2", "QbzG2CqHU1Nn6F1EogZ1d3dp8ilRTMJQBwTJDQBzS-U"): "user2",
+}
+# Global variables for model
 model = None
 tokenizer = None
+model_loaded = False
 # Request/Response models
 class ChatRequest(BaseModel):
+    message: str = Field(..., min_length=1, max_length=1000)
+    max_length: Optional[int] = Field(100, ge=10, le=500)
+    temperature: Optional[float] = Field(0.7, ge=0.1, le=2.0)
 class ChatResponse(BaseModel):
     response: str
     model_used: str
     timestamp: str
     processing_time: float
 class HealthResponse(BaseModel):
     status: str
     model_loaded: bool
     timestamp: str
 def verify_api_key(credentials: HTTPAuthorizationCredentials = Security(security)) -> str:
     """Verify API key authentication"""
     api_key = credentials.credentials
+    if api_key not in API_KEYS:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Invalid API key"
         )
+    return API_KEYS[api_key]
 @app.on_event("startup")
 async def load_model():
     """Load the LLM model on startup"""
+    global model, tokenizer, model_loaded
     try:
+        logger.info("Loading model...")
+        # Try to import and load transformers
+        try:
+            from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+            import torch
+            model_name = os.getenv("MODEL_NAME", "microsoft/DialoGPT-small")
+            logger.info(f"Loading model: {model_name}")
+            # Load tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+            # Load model
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float32,  # Use float32 for compatibility
+                low_cpu_mem_usage=True
+            )
+            model_loaded = True
+            logger.info("Model loaded successfully!")
+        except Exception as e:
+            logger.warning(f"Could not load transformers model: {e}")
+            logger.info("Running in demo mode with simple responses")
+            model_loaded = False
     except Exception as e:
+        logger.error(f"Error during startup: {str(e)}")
+        model_loaded = False
 @app.get("/", response_model=HealthResponse)
 async def root():
     """Health check endpoint"""
     return HealthResponse(
         status="healthy",
+        model_loaded=model_loaded,
+        timestamp=datetime.now().isoformat()
     )
 @app.get("/health", response_model=HealthResponse)
 async def health_check():
     """Detailed health check"""
     return HealthResponse(
+        status="healthy" if model_loaded else "demo_mode",
+        model_loaded=model_loaded,
+        timestamp=datetime.now().isoformat()
     )
 @app.post("/chat", response_model=ChatResponse)
     """Main chat endpoint for AI agent interaction"""
     start_time = datetime.now()
     try:
+        if model_loaded and model is not None and tokenizer is not None:
+            # Use actual model
+            from transformers import pipeline
+            generator = pipeline(
+                "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                device=-1  # Use CPU
+            )
+            # Generate response
+            generated = generator(
+                request.message,
+                max_length=request.max_length,
+                temperature=request.temperature,
+                do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+                num_return_sequences=1
+            )
+            response_text = generated[0]['generated_text']
+            if request.message in response_text:
+                response_text = response_text.replace(request.message, "").strip()
+            model_used = os.getenv("MODEL_NAME", "microsoft/DialoGPT-small")
+        else:
+            # Demo mode - simple responses
+            demo_responses = {
+                "hello": "Hello! I'm your AI assistant. How can I help you today?",
+                "hi": "Hi there! I'm ready to assist you.",
+                "how are you": "I'm doing well, thank you for asking! How can I help you?",
+                "what is ai": "AI (Artificial Intelligence) is the simulation of human intelligence in machines that are programmed to think and learn.",
+                "machine learning": "Machine learning is a subset of AI that enables computers to learn and improve from experience without being explicitly programmed.",
+                "default": "I'm an AI assistant ready to help you. Could you please rephrase your question?"
+            }
+            message_lower = request.message.lower()
+            response_text = demo_responses.get("default", "I'm here to help!")
+            for key, response in demo_responses.items():
+                if key in message_lower:
+                    response_text = response
+                    break
+            model_used = "demo_mode"
         # Calculate processing time
         processing_time = (datetime.now() - start_time).total_seconds()
         return ChatResponse(
             response=response_text,
+            model_used=model_used,
             timestamp=datetime.now().isoformat(),
             processing_time=processing_time
         )
 async def get_model_info(user: str = Depends(verify_api_key)):
     """Get information about the loaded model"""
     return {
+        "model_name": os.getenv("MODEL_NAME", "microsoft/DialoGPT-small"),
+        "model_loaded": model_loaded,
+        "status": "loaded" if model_loaded else "demo_mode"
     }
 if __name__ == "__main__":
+    # For local development and Hugging Face Spaces
+    port = int(os.getenv("PORT", "7860"))
     uvicorn.run(
+        "app_simple:app",
         host="0.0.0.0",
+        port=port,
         reload=False
     )