"""
Minimal OpenAI-compatible local server that serves LiquidAI/LFM2-1.2B via Hugging Face
Transformers on CPU and exposes a subset of the OpenAI REST API (chat/completions,
completions, models).

Save as local_openai_compatible_server.py and run:
    pip install -r requirements.txt
    python local_openai_compatible_server.py

Or run with uvicorn directly (recommended for development/production):
    uvicorn local_openai_compatible_server:app --host 0.0.0.0 --port 7860

Requirements (requirements.txt):
    fastapi
    uvicorn[standard]
    transformers
    torch

Notes:
- CPU-only: the model loads on CPU (generation may be slow for a 1.2B model depending on your machine).
- Model repo id used: "LiquidAI/LFM2-1.2B"; adjust if you use a different repo or a local copy.
- This is a simplified compatibility layer. It is NOT feature-complete with OpenAI's API,
  but it implements the common fields: messages, max_tokens, temperature, top_p, n, stop, stream (basic).
"""

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional, Any, Dict
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import json
import uuid

# -----------------------------
# Configuration
# -----------------------------
MODEL_ID = "/LiquidAI/LFM2-1.2B"  # change to your model location or HF repo
HOST = "0.0.0.0"
PORT = 7860
DEVICE = torch.device("cpu")  # CPU-only as requested
DEFAULT_MAX_TOKENS = 256

# -----------------------------
# Load model & tokenizer
# -----------------------------
print(f"Loading tokenizer and model '{MODEL_ID}' on device {DEVICE} (CPU-only)... this may take a while")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
    model.to(DEVICE)
    model.eval()
except Exception as e:
    raise RuntimeError(f"Failed to load model/tokenizer for '{MODEL_ID}': {e}")

# If the tokenizer has no pad token, fall back to the EOS token so generation can pad
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

# -----------------------------
# FastAPI app
# -----------------------------
app = FastAPI(title="Local OpenAI-compatible server (transformers)", version="0.1")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# -----------------------------
# Pydantic models (request bodies)
# -----------------------------
class Message(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    model: Optional[str] = MODEL_ID
    messages: List[Message]
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS
    temperature: Optional[float] = 0.0
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    stop: Optional[List[str]] = None
    stream: Optional[bool] = False

# -----------------------------
# Helpers
# -----------------------------
def build_prompt_from_messages(messages: List[Dict[str, Any]]) -> str:
    # Simple conversational prompt formatting. Adjust to suit model's expected format.
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            parts.append(f"<|system|> {content}\n")
        elif role == "user":
            parts.append(f"User: {content}\n")
        elif role == "assistant":
            parts.append(f"Assistant: {content}\n")
        else:
            parts.append(f"{role}: {content}\n")
    parts.append("Assistant: ")
    return "".join(parts)


def apply_stop_sequences(text: str, stops: Optional[List[str]]) -> str:
    if not stops:
        return text
    idx = None
    for s in stops:
        if s == "":
            continue
        pos = text.find(s)
        if pos != -1:
            if idx is None or pos < idx:
                idx = pos
    if idx is not None:
        return text[:idx]
    return text
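
# For example, apply_stop_sequences("Hello!\nUser: more", ["\nUser:"]) returns "Hello!",
# truncating at the earliest stop sequence found in the generated text.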

# -----------------------------
# Endpoints
# -----------------------------
@app.get("/", response_class=PlainTextResponse)
async def root():
    return "Local OpenAI-compatible server running. Use /v1/chat/completions or /v1/models"

@app.get("/v1/models")
async def list_models():
    return {"data": [{"id": MODEL_ID, "object": "model"}], "object": "list"}

@app.post("/v1/chat/completions")
async def chat_completions(request: Request, body: ChatCompletionRequest):
    # Basic validation: this server only serves the single locally loaded model
    if body.model is None or body.model != MODEL_ID:
        raise HTTPException(status_code=400, detail={"error": "invalid_model", "message": f"Only model {MODEL_ID} is available on this server."})

    # Pydantic v2 renamed .dict() to .model_dump(); support both
    prompt = build_prompt_from_messages([m.model_dump() if hasattr(m, "model_dump") else m.dict() for m in body.messages])

    # Tokenize and move tensors to the target device
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(DEVICE)
    attention_mask = inputs["attention_mask"].to(DEVICE)
    input_len = input_ids.shape[-1]

    # Generation settings: greedy decoding when temperature is 0, sampling otherwise
    do_sample = bool(body.temperature and body.temperature > 0.0)
    gen_kwargs = {
        "max_new_tokens": body.max_tokens,
        "do_sample": do_sample,
        "num_return_sequences": int(body.n or 1),
        "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
        # note: on CPU large models may be slow
    }
    if do_sample:
        # sampling knobs are only meaningful (and only accepted without warnings) when sampling
        gen_kwargs["temperature"] = float(body.temperature)
        gen_kwargs["top_p"] = float(body.top_p or 1.0)

    # Synchronous generation (blocking; CPU generation for a 1.2B model can take a while)
    with torch.no_grad():
        outputs = model.generate(input_ids, attention_mask=attention_mask, **gen_kwargs)

    choices = []
    for i, out_ids in enumerate(outputs):
        # Decode only the newly generated tokens: for a decoder-only model the first
        # input_len positions of each output sequence are the prompt itself, so slicing
        # them off is more reliable than string-matching the decoded prompt.
        generated = tokenizer.decode(out_ids[input_len:], skip_special_tokens=True)

        # apply stop sequences
        generated = apply_stop_sequences(generated, body.stop)

        # build a choice structure similar to OpenAI's
        choice = {
            "index": i,
            "message": {"role": "assistant", "content": generated},
            "finish_reason": "stop" if body.stop else "length",
        }
        choices.append(choice)

    # approximate token usage (completion tokens are counted for a single returned sequence)
    completion_tokens = max(0, (outputs.shape[-1] - input_len) if outputs is not None else 0)
    usage = {"prompt_tokens": int(input_len), "completion_tokens": int(completion_tokens), "total_tokens": int(input_len + completion_tokens)}

    response = {
        "id": str(uuid.uuid4()),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": body.model,
        "choices": choices,
        "usage": usage,
    }

    # Streaming: rudimentary implementation that streams chunks of the final text as SSE
    if body.stream:
        # Only the first choice is streamed (requests with n > 1 still stream choice 0)
        text_to_stream = choices[0]["message"]["content"]
        def event_stream():
            # send the already-generated text in a few small chunks
            chunk_size = 128
            for start in range(0, len(text_to_stream), chunk_size):
                chunk = text_to_stream[start:start + chunk_size]
                payload = {"id": response["id"], "object": "chat.completion.chunk", "created": response["created"], "model": response["model"], "choices": [{"index": 0, "delta": {"content": chunk}, "finish_reason": None}]}
                yield f"data: {json.dumps(payload)}\n\n"
            # final chunk carries a finish_reason, then the OpenAI-style [DONE] terminator
            done_payload = {"id": response["id"], "object": "chat.completion.chunk", "created": response["created"], "model": response["model"], "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
            yield f"data: {json.dumps(done_payload)}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(event_stream(), media_type="text/event-stream")

    return JSONResponse(response)

# A convenience POST /v1/completions that accepts 'prompt' (legacy completions API).
# Note: responses keep the chat.completion shape rather than the legacy text_completion shape.
class CompletionRequest(BaseModel):
    model: Optional[str] = MODEL_ID
    prompt: Optional[str] = ""
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS
    temperature: Optional[float] = 0.0
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    stop: Optional[List[str]] = None
    stream: Optional[bool] = False

@app.post("/v1/completions")
async def completions(req: CompletionRequest):
    # wrap prompt into the chat-format for our generator
    messages = [Message(role="user", content=req.prompt)]
    chat_req = ChatCompletionRequest(model=req.model, messages=messages, max_tokens=req.max_tokens, temperature=req.temperature, top_p=req.top_p, n=req.n, stop=req.stop, stream=req.stream)
    # call the chat_completions handler directly
    return await chat_completions(Request(scope={}), chat_req)

# -----------------------------
# If executed directly, run uvicorn
# -----------------------------
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("local_openai_compatible_server:app", host=HOST, port=PORT, log_level="info")
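
# -----------------------------
# Example client usage (sketch)
# -----------------------------
# Not executed by this module. A minimal sketch assuming the official `openai`
# Python package (v1+) is installed and the server above is running locally;
# the api_key value is arbitrary since this server performs no authentication.
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="LiquidAI/LFM2-1.2B",
#       messages=[{"role": "user", "content": "Say hello in one sentence."}],
#       max_tokens=32,
#   )
#   print(resp.choices[0].message.content)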