Spaces:
Sleeping
Sleeping
from fastapi import FastAPI, HTTPException, Request | |
from fastapi.middleware.cors import CORSMiddleware | |
from fastapi.responses import StreamingResponse | |
from pydantic import BaseModel, Field | |
from typing import List | |
import os | |
from huggingface_hub import InferenceClient | |
import logging | |
# Logging setup: create this module's logger, then configure the root
# logger so records at INFO level and above are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class Message(BaseModel):
    # A single conversation turn: who said it and what was said.
    # Both fields are required (no default is supplied).
    role: str = Field(
        ...,
        description="Role of the message sender (system/user/assistant)",
    )
    content: str = Field(
        ...,
        description="Content of the message",
    )
class ChatInput(BaseModel):
    # Request body for a chat completion: the conversation so far plus
    # optional sampling settings, each validated against the bounds below.

    # Required conversation history.
    messages: List[Message] = Field(
        ...,
        description="List of conversation messages",
    )
    # Integer in 1..4096; 2048 when omitted.
    max_tokens: int = Field(
        default=2048,
        gt=0,
        le=4096,
        description="Maximum number of tokens to generate",
    )
    # Strictly positive, at most 2.0; 0.5 when omitted.
    temperature: float = Field(
        default=0.5,
        gt=0,
        le=2.0,
        description="Temperature for sampling",
    )
    # Strictly positive, at most 1.0; 0.7 when omitted.
    top_p: float = Field(
        default=0.7,
        gt=0,
        le=1.0,
        description="Top-p sampling parameter",
    )
app = FastAPI()

# Add CORS middleware.
# Every origin is allowed, so credentialed requests are disabled: the CORS
# specification does not permit a wildcard origin together with credentials,
# and FastAPI's CORS documentation requires explicit origins when
# allow_credentials is True. The handlers in this file take the Hugging Face
# token from the server environment, not from browser cookies.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Resolve the model identifier once so that the inference client and the
# endpoints that report MODEL_ID always agree. The MODEL_ID environment
# variable overrides the default; an unset or empty value falls back to it.
MODEL_ID = os.getenv("MODEL_ID") or "mistralai/Mistral-Nemo-Instruct-2407"

# Initialize Hugging Face client.
# The token is read from HF_TOKEN (None when the variable is not set), and
# timeout=30 is passed through to the client.
hf_client = InferenceClient(
    model=MODEL_ID,
    token=os.getenv("HF_TOKEN"),
    timeout=30,
)
async def generate_stream(
    messages: List[Message],
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """Generate a streaming chat response using the Hugging Face Inference API.

    Args:
        messages: Conversation history; each item's ``role`` and ``content``
            are forwarded to the API.
        max_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling parameter.

    Yields:
        Non-empty text fragments of the reply, in the order received.

    Raises:
        ValueError: If building the request or reading the stream fails; the
            original exception is attached as ``__cause__``.
    """
    try:
        # Convert messages to the role/content dicts expected by the API.
        formatted_messages = [
            {"role": msg.role, "content": msg.content} for msg in messages
        ]

        # The chat-completion endpoint accepts a list of messages, whereas
        # text_generation expects a single prompt string. It has no
        # do_sample flag; sampling is driven by temperature and top_p.
        # NOTE: the client is synchronous, so waiting for each chunk happens
        # on the thread that drives this async generator.
        stream = hf_client.chat_completion(
            messages=formatted_messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
        )

        for chunk in stream:
            # Skip chunks that carry no choice or no text delta.
            if not chunk.choices:
                continue
            text = chunk.choices[0].delta.content
            if text:
                yield text
    except Exception as e:
        logger.error(f"Error in generate_stream: {e}", exc_info=True)
        raise ValueError(f"Error generating response: {e}") from e
async def chat_stream(input: ChatInput, request: Request):
    """Stream chat completions based on the input messages.

    Args:
        input: Validated request body (messages and sampling settings).
        request: Incoming request, used only to log the caller's host.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream`` whose
        body is produced by ``generate_stream``.

    Raises:
        HTTPException: 500 when ``HF_TOKEN`` is not set, or when building the
            response fails. Errors raised while the body is being streamed
            occur after this function has returned and are not caught here.
    """
    # NOTE(review): no route decorator is visible on this handler in this
    # view of the file; confirm it is registered on `app`.

    # Checked outside the try block so this HTTPException reaches the client
    # unchanged instead of being re-wrapped by the generic handler below.
    if not os.getenv("HF_TOKEN"):
        raise HTTPException(
            status_code=500,
            detail="HF_TOKEN environment variable not set",
        )

    try:
        # request.client can be None, so fall back to a placeholder rather
        # than failing on attribute access.
        client_host = request.client.host if request.client else "unknown"
        logger.info(f"Received chat request from {client_host}")
        logger.info(f"Number of messages: {len(input.messages)}")

        return StreamingResponse(
            generate_stream(
                messages=input.messages,
                max_tokens=input.max_tokens,
                temperature=input.temperature,
                top_p=input.top_p,
            ),
            media_type="text/event-stream",
        )
    except Exception as e:
        logger.error(f"Error in chat_stream endpoint: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
async def root():
    """Return the welcome payload: a greeting, a status flag and the model id."""
    # NOTE(review): no route decorator is visible on this handler in this
    # view of the file; confirm it is registered on `app`.
    greeting = "Welcome to the Hugging Face Inference API Streaming Chat!"
    return dict(message=greeting, status="running", model=MODEL_ID)
async def health_check():
    """Report service health, the model id, and whether HF_TOKEN is set."""
    # NOTE(review): no route decorator is visible on this handler in this
    # view of the file; confirm it is registered on `app`.
    token_present = bool(os.getenv("HF_TOKEN"))
    report = {"status": "healthy", "model": MODEL_ID}
    report["hf_token_set"] = token_present
    return report
if __name__ == "__main__":
    # Script entry point: uvicorn is imported only when the file is run
    # directly, then serves `app` on all interfaces at port 8000.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000, "log_level": "info"}
    uvicorn.run(app, **server_options)