dondoesstuff committed
Commit a7e85e8 · verified · 1 Parent(s): c3cc224

Upload 3 files

Files changed (3):
  1. README.md +63 -12
  2. app.py +222 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,63 @@
- ---
- title: LFM2 1.2B
- emoji: 📈
- colorFrom: red
- colorTo: red
- sdk: gradio
- sdk_version: 5.42.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # OpenAI-compatible API for LiquidAI/LFM2-1.2B
+
+ This is a minimal FastAPI server that exposes OpenAI-compatible endpoints backed by the Hugging Face Transformers model `LiquidAI/LFM2-1.2B`.
+
+ Endpoints:
+ - `POST /v1/chat/completions` (OpenAI Chat Completions)
+ - `POST /v1/completions` (OpenAI Completions)
+ - `GET /health` (health check)
+
+ Runs on port 7860 by default.
+
+ ## Setup
+
+ 1. Create and activate a Python environment (recommended).
+ 2. Install dependencies:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. Run the server:
+
+ ```bash
+ python app.py
+ ```
+
+ The API will be available at `http://localhost:7860`. Interactive docs: `http://localhost:7860/docs`.
+
+ ## Example requests
+
+ Chat:
+
+ ```bash
+ curl http://localhost:7860/v1/chat/completions \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "LiquidAI/LFM2-1.2B",
+     "messages": [
+       {"role": "system", "content": "You are a helpful assistant."},
+       {"role": "user", "content": "Write a haiku about the ocean"}
+     ],
+     "temperature": 0.7,
+     "max_tokens": 128
+   }'
+ ```
+
+ Completions:
+
+ ```bash
+ curl http://localhost:7860/v1/completions \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "LiquidAI/LFM2-1.2B",
+     "prompt": "Explain quantum computing in simple terms",
+     "temperature": 0.7,
+     "max_tokens": 128
+   }'
+ ```
+
+ ## Notes
+ - The server auto-selects BF16 (if supported) or FP16 on CUDA; without a GPU it falls back to FP32 on CPU, which is slow.
+ - Configure it via the environment variables `MODEL_ID`, `MAX_TOKENS`, and `PORT`.
+ - This minimal server supports only `n=1` and returns a single completion.
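
Because the endpoints mirror the OpenAI schema, the official `openai` Python client can also talk to this server. A minimal sketch (not part of the commit), assuming `pip install openai` (the package is not in `requirements.txt`) and a server already running on port 7860; the server performs no authentication, so any placeholder API key works:

```python
# Sketch: calling this server through the OpenAI Python client (>=1.0).
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:7860/v1",  # point the client at the local server
    api_key="not-needed",                 # no auth is enforced; any string works
)

resp = client.chat.completions.create(
    model="LiquidAI/LFM2-1.2B",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about the ocean"},
    ],
    temperature=0.7,
    max_tokens=128,
)
print(resp.choices[0].message.content)
```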
app.py ADDED
@@ -0,0 +1,222 @@
+ import os
+ import time
+ import uuid
+ from typing import List, Optional, Dict, Any
+
+ import torch
+ from fastapi import FastAPI, HTTPException
+ from fastapi.responses import RedirectResponse
+ from pydantic import BaseModel, Field
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ MODEL_ID = os.getenv("MODEL_ID", "LiquidAI/LFM2-1.2B")
+ DEFAULT_MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
+
+ app = FastAPI(title="OpenAI-compatible API for LiquidAI/LFM2-1.2B")
+
+ tokenizer = None
+ model = None
+
+
+ def get_dtype() -> torch.dtype:
+     if torch.cuda.is_available():
+         # Prefer bfloat16 if supported; else float16
+         if torch.cuda.is_bf16_supported():
+             return torch.bfloat16
+         return torch.float16
+     # CPU
+     return torch.float32
+
+
+ @app.on_event("startup")
+ def load_model():
+     global tokenizer, model
+     dtype = get_dtype()
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         torch_dtype=dtype,
+         device_map="auto",
+         trust_remote_code=True,
+     )
+     # Ensure eos/pad tokens exist
+     if tokenizer.eos_token is None:
+         tokenizer.eos_token = tokenizer.sep_token or tokenizer.pad_token or "</s>"
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+
+ class ChatMessage(BaseModel):
+     role: str
+     content: str
+
+
+ class ChatCompletionRequest(BaseModel):
+     model: Optional[str] = Field(default=MODEL_ID)
+     messages: List[ChatMessage]
+     temperature: Optional[float] = 0.7
+     top_p: Optional[float] = 0.95
+     max_tokens: Optional[int] = None
+     stop: Optional[List[str] | str] = None
+     n: Optional[int] = 1
+
+
+ class CompletionRequest(BaseModel):
+     model: Optional[str] = Field(default=MODEL_ID)
+     prompt: str | List[str]
+     temperature: Optional[float] = 0.7
+     top_p: Optional[float] = 0.95
+     max_tokens: Optional[int] = None
+     stop: Optional[List[str] | str] = None
+     n: Optional[int] = 1
+
+
+ class Usage(BaseModel):
+     prompt_tokens: int
+     completion_tokens: int
+     total_tokens: int
+
+
+ # Simple chat prompt formatter
+
+ def build_chat_prompt(messages: List[ChatMessage]) -> str:
+     system_prefix = "You are a helpful assistant."
+     system_msgs = [m.content for m in messages if m.role == "system"]
+     if system_msgs:
+         system_prefix = system_msgs[-1]
+
+     conv: List[str] = [f"System: {system_prefix}"]
+     for m in messages:
+         if m.role == "system":
+             continue
+         role = "User" if m.role == "user" else ("Assistant" if m.role == "assistant" else m.role.capitalize())
+         conv.append(f"{role}: {m.content}")
+     conv.append("Assistant:")
+     return "\n".join(conv)
+
+
+ def apply_stop_sequences(text: str, stop: Optional[List[str] | str]) -> str:
+     if stop is None:
+         return text
+     stops = stop if isinstance(stop, list) else [stop]
+     cut = len(text)
+     for s in stops:
+         if not s:
+             continue
+         idx = text.find(s)
+         if idx != -1:
+             cut = min(cut, idx)
+     return text[:cut]
+
+
+ def generate_once(prompt: str, temperature: float, top_p: float, max_new_tokens: int) -> Dict[str, Any]:
+     assert tokenizer is not None and model is not None, "Model not loaded"
+
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+     gen_ids = model.generate(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         do_sample=True if temperature and temperature > 0 else False,
+         temperature=max(0.0, float(temperature or 0.0)),
+         top_p=max(0.0, float(top_p or 1.0)),
+         pad_token_id=tokenizer.pad_token_id,
+         eos_token_id=tokenizer.eos_token_id,
+     )
+     out = tokenizer.decode(gen_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+     return {
+         "text": out,
+         "prompt_tokens": inputs["input_ids"].numel(),
+         "completion_tokens": gen_ids[0].shape[0] - inputs["input_ids"].shape[-1],
+     }
+
+
+ @app.get("/")
+ def root():
+     return RedirectResponse(url="/docs")
+
+
+ @app.get("/health")
+ def health():
+     return {"status": "ok", "model": MODEL_ID}
+
+
+ @app.post("/v1/chat/completions")
+ def chat_completions(req: ChatCompletionRequest):
+     if req.n and req.n > 1:
+         raise HTTPException(status_code=400, detail="Only n=1 is supported in this simple server.")
+     max_new = req.max_tokens or DEFAULT_MAX_TOKENS
+
+     prompt = build_chat_prompt(req.messages)
+     g = generate_once(prompt, req.temperature or 0.7, req.top_p or 0.95, max_new)
+     text = apply_stop_sequences(g["text"], req.stop)
+
+     created = int(time.time())
+     comp_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
+
+     usage = Usage(
+         prompt_tokens=g["prompt_tokens"],
+         completion_tokens=g["completion_tokens"],
+         total_tokens=g["prompt_tokens"] + g["completion_tokens"],
+     )
+
+     return {
+         "id": comp_id,
+         "object": "chat.completion",
+         "created": created,
+         "model": req.model or MODEL_ID,
+         "choices": [
+             {
+                 "index": 0,
+                 "message": {"role": "assistant", "content": text},
+                 "finish_reason": "stop",
+             }
+         ],
+         "usage": usage.dict(),
+     }
+
+
+ @app.post("/v1/completions")
+ def completions(req: CompletionRequest):
+     if req.n and req.n > 1:
+         raise HTTPException(status_code=400, detail="Only n=1 is supported in this simple server.")
+
+     prompts = req.prompt if isinstance(req.prompt, list) else [req.prompt]
+     if len(prompts) != 1:
+         raise HTTPException(status_code=400, detail="Only a single prompt is supported in this simple server.")
+
+     max_new = req.max_tokens or DEFAULT_MAX_TOKENS
+
+     g = generate_once(prompts[0], req.temperature or 0.7, req.top_p or 0.95, max_new)
+     text = apply_stop_sequences(g["text"], req.stop)
+
+     created = int(time.time())
+     comp_id = f"cmpl-{uuid.uuid4().hex[:24]}"
+
+     usage = Usage(
+         prompt_tokens=g["prompt_tokens"],
+         completion_tokens=g["completion_tokens"],
+         total_tokens=g["prompt_tokens"] + g["completion_tokens"],
+     )
+
+     return {
+         "id": comp_id,
+         "object": "text_completion",
+         "created": created,
+         "model": req.model or MODEL_ID,
+         "choices": [
+             {
+                 "index": 0,
+                 "text": text,
+                 "finish_reason": "stop",
+                 "logprobs": None,
+             }
+         ],
+         "usage": usage.dict(),
+     }
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     port = int(os.getenv("PORT", "7860"))
+     uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
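
For reference, `build_chat_prompt` flattens the OpenAI-style message list into a plain `System:`/`User:`/`Assistant:` transcript rather than calling the tokenizer's chat template. A small illustration (not part of the commit), assuming `app.py` is importable from the current directory:

```python
# Illustration: the prompt string that build_chat_prompt() in app.py produces
# for the README's chat example. Importing app does not load the model weights;
# that only happens in the FastAPI startup hook.
from app import ChatMessage, build_chat_prompt

messages = [
    ChatMessage(role="system", content="You are a helpful assistant."),
    ChatMessage(role="user", content="Write a haiku about the ocean"),
]
print(build_chat_prompt(messages))
# System: You are a helpful assistant.
# User: Write a haiku about the ocean
# Assistant:
```

If the model ships a chat template, `tokenizer.apply_chat_template` would likely be a closer match to its training format; the plain transcript above is what this server actually sends.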
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi>=0.110.0
+ uvicorn>=0.29.0
+ transformers>=4.41.0
+ torch
+ accelerate>=0.30.0
+ sentencepiece>=0.2.0
+ safetensors>=0.4.3
+ pydantic>=2.5.0
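
After installing these, a quick sanity check (a sketch, not part of the commit) mirrors the dtype selection in `app.py`'s `get_dtype()` and reports what the server will run on before the first model download:

```python
# Sketch: report the device and dtype app.py will pick at startup.
import torch
import transformers

print("torch", torch.__version__, "| transformers", transformers.__version__)
if torch.cuda.is_available():
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    print("CUDA device:", torch.cuda.get_device_name(0), "| dtype:", dtype)
else:
    print("No CUDA device found; the server will run on CPU in float32 (slow).")
```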