ndc8 committed
Commit 6e96e6e · 1 Parent(s): 8a3c5dd

Refactor application to implement GGUF backend with native transformers support; update requirements and add GGUF-specific entry point

app.py CHANGED
@@ -1,11 +1,11 @@
  #!/usr/bin/env python3
  """
- Entry point for Hugging Face Spaces - Lightweight Backend
- This file imports and runs the lightweight FastAPI application optimized for CPU and memory constraints
+ Entry point for Hugging Face Spaces - GGUF Backend
+ This file imports and runs the GGUF FastAPI application with native transformers GGUF support
  """

- # Import the lightweight backend instead of GGUF backend
- from lightweight_backend import app
+ # Import the GGUF backend with native transformers support
+ from gguf_transformers_backend import app

  if __name__ == "__main__":
      import uvicorn
gguf_transformers_backend.py ADDED
@@ -0,0 +1,291 @@
+ #!/usr/bin/env python3
+ """
+ GGUF Backend with Native Transformers Support
+ Uses transformers library's built-in GGUF loading (no llama-cpp-python needed)
+ """
+
+ import os
+ import logging
+ from contextlib import asynccontextmanager
+ from typing import List, Dict, Any, Optional
+ import uuid
+ import time
+
+ from fastapi import FastAPI, HTTPException
+ from fastapi.responses import JSONResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, Field, field_validator
+
+ # Import transformers with GGUF support
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Pydantic models for OpenAI-compatible API
+ class ChatMessage(BaseModel):
+     role: str = Field(..., description="The role of the message author")
+     content: str = Field(..., description="The content of the message")
+
+     @field_validator('role')
+     @classmethod
+     def validate_role(cls, v: str) -> str:
+         if v not in ["system", "user", "assistant"]:
+             raise ValueError("Role must be one of: system, user, assistant")
+         return v
+
+ class ChatCompletionRequest(BaseModel):
+     model: str = Field(default="gemma-3n-e4b-it", description="The model to use for completion")
+     messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
+     max_tokens: Optional[int] = Field(default=256, ge=1, le=1024, description="Maximum tokens to generate")
+     temperature: Optional[float] = Field(default=1.0, ge=0.0, le=2.0, description="Sampling temperature")
+     top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0, description="Top-p sampling")
+     stream: Optional[bool] = Field(default=False, description="Whether to stream responses")
+
+ class ChatCompletionChoice(BaseModel):
+     index: int
+     message: ChatMessage
+     finish_reason: str
+
+ class ChatCompletionResponse(BaseModel):
+     id: str
+     object: str = "chat.completion"
+     created: int
+     model: str
+     choices: List[ChatCompletionChoice]
+
+ class HealthResponse(BaseModel):
+     status: str
+     model: str
+     version: str
+     backend: str
+     quantization: str
+
+ # Global variables for model management
+ current_model = os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF")
+ gguf_filename = os.environ.get("GGUF_FILE", "*Q4_K_M.gguf")
+ tokenizer = None
+ model = None
+ text_pipeline = None
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """Application lifespan manager with GGUF model loading via transformers"""
+     global tokenizer, model, text_pipeline
+
+     logger.info("🚀 Starting GGUF Backend Service (Transformers Native)")
+
+     if os.environ.get("DEMO_MODE", "").strip() not in ("", "0", "false", "False"):
+         logger.info("🧪 DEMO_MODE enabled: skipping model load")
+         yield
+         logger.info("🔄 Shutting down GGUF Backend Service (demo mode)...")
+         return
+
+     try:
+         logger.info(f"📥 Loading GGUF model: {current_model}")
+         logger.info(f"🎯 GGUF file pattern: {gguf_filename}")
+
+         # Load tokenizer first
+         tokenizer = AutoTokenizer.from_pretrained(
+             current_model,
+             trust_remote_code=True,
+             use_fast=True
+         )
+
+         # Ensure pad token exists
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         # Load GGUF model using native transformers support
+         logger.info("⚙️ Loading GGUF model with transformers native support...")
+         model = AutoModelForCausalLM.from_pretrained(
+             current_model,
+             gguf_file=gguf_filename,  # Key parameter for GGUF loading
+             torch_dtype=torch.float32,  # CPU-compatible
+             device_map="auto",  # Let transformers handle device placement
+             low_cpu_mem_usage=True,  # Memory optimization
+             trust_remote_code=True,
+         )
+
+         # Create pipeline for efficient generation
+         text_pipeline = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             max_new_tokens=256,
+             do_sample=True,
+             temperature=1.0,
+             top_p=0.95,
+             pad_token_id=tokenizer.eos_token_id,
+         )
+
+         logger.info("✅ Successfully loaded GGUF model with transformers")
+         logger.info(f"📊 Model: {current_model}")
+         logger.info(f"🔧 GGUF File: {gguf_filename}")
+         logger.info(f"🧠 Backend: Transformers native GGUF support")
+
+     except Exception as e:
+         logger.error(f"❌ Failed to initialize GGUF model: {e}")
+         logger.info("🔄 Starting service in demo mode")
+         model = None
+         tokenizer = None
+         text_pipeline = None
+
+     yield
+
+     logger.info("🔄 Shutting down GGUF Backend Service...")
+     # Clean up model resources
+     if model:
+         del model
+     if tokenizer:
+         del tokenizer
+     if text_pipeline:
+         del text_pipeline
+
+ # Initialize FastAPI app
+ app = FastAPI(
+     title="GGUF Backend Service (Transformers Native)",
+     description="Memory-efficient GGUF model API using transformers native support",
+     version="1.0.0",
+     lifespan=lifespan
+ )
+
+ # Configure CORS
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ def convert_messages_to_prompt(messages: List[ChatMessage]) -> str:
+     """Convert OpenAI messages format to Gemma 3n chat format."""
+     prompt_parts = []
+
+     for message in messages:
+         role = message.role
+         content = message.content.strip()
+
+         if role == "system":
+             prompt_parts.append(f"<start_of_turn>system\n{content}<end_of_turn>")
+         elif role == "user":
+             prompt_parts.append(f"<start_of_turn>user\n{content}<end_of_turn>")
+         elif role == "assistant":
+             prompt_parts.append(f"<start_of_turn>model\n{content}<end_of_turn>")
+
+     # Add the start for model response
+     prompt_parts.append("<start_of_turn>model\n")
+
+     return "\n".join(prompt_parts)
+
+ def generate_response(messages: List[ChatMessage], max_tokens: int = 256, temperature: float = 1.0, top_p: float = 0.95) -> str:
+     """Generate response using GGUF model via transformers pipeline."""
+     if text_pipeline is None:
+         return "🤖 Demo mode: GGUF model not loaded. This would be a real response from the Gemma 3n GGUF model."
+
+     try:
+         # Convert messages to prompt
+         prompt = convert_messages_to_prompt(messages)
+
+         # Limit max_tokens for memory efficiency
+         max_tokens = min(max_tokens, 512)
+
+         # Generate response
+         result = text_pipeline(
+             prompt,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             do_sample=True,
+             return_full_text=False,
+             pad_token_id=tokenizer.eos_token_id,
+         )
+
+         # Extract generated text
+         if result and len(result) > 0:
+             response_text = result[0]['generated_text'].strip()
+             # Clean up any unwanted tokens
+             if "<end_of_turn>" in response_text:
+                 response_text = response_text.split("<end_of_turn>")[0].strip()
+             return response_text
+         else:
+             return "I apologize, but I'm having trouble generating a response right now."
+
+     except Exception as e:
+         logger.error(f"GGUF generation failed: {e}")
+         return "I apologize, but I'm having trouble generating a response right now. Please try again."
+
+ @app.get("/", response_class=JSONResponse)
+ async def root() -> Dict[str, Any]:
+     """Root endpoint with service information"""
+     return {
+         "service": "GGUF Backend Service",
+         "version": "1.0.0",
+         "model": current_model,
+         "gguf_file": gguf_filename,
+         "backend": "transformers-native-gguf",
+         "quantization": "Q4_K_M",
+         "endpoints": {
+             "health": "/health",
+             "chat": "/v1/chat/completions",
+             "docs": "/docs"
+         }
+     }
+
+ @app.get("/health", response_model=HealthResponse)
+ async def health_check():
+     """Health check endpoint"""
+     status = "healthy" if text_pipeline is not None else "demo_mode"
+
+     return HealthResponse(
+         status=status,
+         model=current_model,
+         version="1.0.0",
+         backend="transformers-native-gguf",
+         quantization="Q4_K_M"
+     )
+
+ @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+ async def create_chat_completion(request: ChatCompletionRequest) -> ChatCompletionResponse:
+     """Create a chat completion (OpenAI-compatible) using GGUF model"""
+
+     try:
+         # Generate response
+         response_text = generate_response(
+             messages=request.messages,
+             max_tokens=request.max_tokens or 256,
+             temperature=request.temperature or 1.0,
+             top_p=request.top_p or 0.95
+         )
+
+         # Create response message
+         response_message = ChatMessage(role="assistant", content=response_text)
+
+         # Create choice
+         choice = ChatCompletionChoice(
+             index=0,
+             message=response_message,
+             finish_reason="stop"
+         )
+
+         # Create completion response
+         completion = ChatCompletionResponse(
+             id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
+             object="chat.completion",
+             created=int(time.time()),
+             model=request.model,
+             choices=[choice]
+         )
+
+         return completion
+
+     except Exception as e:
+         logger.error(f"Chat completion failed: {e}")
+         raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
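
For reference, a minimal client-side sketch (not part of this commit) of exercising the OpenAI-compatible endpoint the backend above exposes. It assumes the service is already running locally on port 8000 and uses httpx, which is already listed in requirements.txt; the model name and prompt text are illustrative.

import httpx

# Request body mirrors the ChatCompletionRequest model defined in the backend above.
payload = {
    "model": "gemma-3n-e4b-it",
    "messages": [
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Summarize what GGUF quantization is in one sentence."},
    ],
    "max_tokens": 128,
    "temperature": 0.7,
    "top_p": 0.95,
}

# CPU generation can be slow, so allow a generous timeout.
response = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120.0)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
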
requirements.txt CHANGED
@@ -1,17 +1,16 @@
 
 
- # Hugging Face Spaces requirements (Lightweight CPU-optimized backend)
+ # Hugging Face Spaces requirements (GGUF with Native Transformers Support)
  fastapi
  uvicorn
  python-dotenv
  httpx
  requests
 
- # Lightweight transformers for CPU-only inference (much smaller than full transformers)
- transformers>=4.46.0
+ # Transformers with native GGUF support (4.45+ has this feature)
+ transformers>=4.45.0
  torch>=2.0.0
  accelerate
- # Note: BitsAndBytesConfig requires CUDA, so we use CPU optimizations instead
 
  # Optional: gradio for demo UI
  # gradio
requirements_gguf.txt ADDED
@@ -0,0 +1,14 @@
+ # Hugging Face Spaces requirements (GGUF with Native Transformers Support)
+ fastapi
+ uvicorn
+ python-dotenv
+ httpx
+ requests
+
+ # Transformers with native GGUF support (4.45+ has this feature)
+ transformers>=4.45.0
+ torch>=2.0.0
+ accelerate
+
+ # Optional: gradio for demo UI
+ # gradio
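
A small sketch (not part of the commit) of the version guard implied by the comment above: it assumes the pinned packages are installed and simply checks that the installed transformers release meets the 4.45 floor this requirements file sets for native GGUF loading.

import transformers

# transformers >= 4.45 is the floor assumed by requirements_gguf.txt for gguf_file loading.
major, minor, *_ = transformers.__version__.split(".")
if (int(major), int(minor)) < (4, 45):
    raise RuntimeError(
        f"transformers {transformers.__version__} is older than the 4.45 floor "
        "required for native GGUF loading in this Space"
    )
print(f"transformers {transformers.__version__} meets the GGUF support floor")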