File size: 30,847 Bytes
4e10023
375ade4
 
4e10023
 
83df634
97bafdb
 
 
 
 
 
 
255b6fc
83df634
255b6fc
 
68f41f4
4e10023
 
 
 
1ba257c
4e10023
 
 
 
 
8d9c495
4e10023
 
 
375ade4
 
 
 
 
 
 
 
 
 
 
8d9c495
4e10023
375ade4
db8cd85
172b424
 
4e10023
 
 
 
172b424
 
 
 
 
 
 
 
 
4e10023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375ade4
4e10023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d9c495
375ade4
 
 
8208c22
375ade4
 
 
 
 
4e10023
8d9c495
 
4e10023
172b424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e10023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d9c495
1ba257c
4e10023
 
 
375ade4
4e10023
375ade4
 
 
 
4e10023
375ade4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c9134e
375ade4
cb5d5f8
375ade4
 
 
 
cb5d5f8
375ade4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb5d5f8
 
375ade4
 
 
cb5d5f8
 
 
375ade4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db8cd85
375ade4
 
 
 
 
 
 
 
 
db8cd85
 
375ade4
db8cd85
 
 
375ade4
db8cd85
375ade4
 
 
db8cd85
 
 
375ade4
 
db8cd85
375ade4
db8cd85
375ade4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ba257c
8208c22
1ba257c
 
 
 
 
 
 
 
4e10023
1ba257c
4e10023
1ba257c
4e10023
 
 
 
8d9c495
4e10023
 
 
 
0c9134e
 
4e10023
 
 
 
 
 
 
 
 
 
 
 
 
8d9c495
 
375ade4
 
 
4e10023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d9c495
 
375ade4
8d9c495
375ade4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e10023
1ba257c
8d9c495
 
1ba257c
 
 
 
8d9c495
 
 
 
 
 
4e10023
 
1ba257c
 
4e10023
1ba257c
 
4e10023
1ba257c
 
 
4e10023
 
375ade4
1ba257c
 
4e10023
 
 
 
 
0c9134e
 
4e10023
 
 
 
 
 
 
 
 
 
 
1ba257c
4e10023
1ba257c
4e10023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68f41f4
e46cec3
68f41f4
8d9c495
68f41f4
4e10023
8d9c495
68f41f4
 
4e10023
 
 
 
 
 
 
 
 
8d9c495
68f41f4
8d9c495
 
68f41f4
 
 
 
4e10023
68f41f4
4e10023
 
 
68f41f4
 
 
 
 
4e10023
 
 
 
 
84eb396
8d9c495
4e10023
 
8d9c495
4e10023
 
 
 
 
8d9c495
 
 
4e10023
8d9c495
 
4e10023
 
68f41f4
4e10023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09c9042
68f41f4
09c9042
 
 
 
68f41f4
09c9042
 
 
68f41f4
09c9042
 
 
2cd680b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
"""
FastAPI Backend AI Service using Gemma-3n-E4B-it-GGUF
Provides OpenAI-compatible chat completion endpoints powered by unsloth/gemma-3n-E4B-it-GGUF
"""

import os
import warnings

# Suppress warnings before any other imports
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
warnings.filterwarnings("ignore", message=".*slow image processor.*")
warnings.filterwarnings("ignore", message=".*rope_scaling.*")

# Direct Hugging Face caches to a writable folder under /tmp (use only HF_HOME, TRANSFORMERS_CACHE is deprecated)
os.environ.setdefault("HF_HOME", "/tmp/.cache/huggingface")
# Suppress advisory warnings from transformers (including deprecation warnings)
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
hf_token = os.environ.get("HF_TOKEN")
import asyncio
import logging
import time
from contextlib import asynccontextmanager
from typing import List, Dict, Any, Optional, Union

from fastapi import FastAPI, HTTPException, Depends, Request
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, field_validator

import uvicorn
import requests
from PIL import Image

# Optional dependency probe: llama-cpp-python is the backend used for GGUF models.
# `llama_cpp_available` records whether the import worked so later code can
# choose between the GGUF backend and the transformers fallback.
try:
    from llama_cpp import Llama
    llama_cpp_available = True
    # NOTE(review): logging.basicConfig() is only called further down in this
    # file, so this INFO record is emitted before logging is configured and is
    # probably not shown with Python's default WARNING threshold — confirm.
    logger = logging.getLogger(__name__)
    logger.info("βœ… llama-cpp-python support available")
except ImportError:
    # Missing package is not fatal; nothing is logged on this path.
    llama_cpp_available = False

# Keep transformers imports as fallback
from transformers import AutoTokenizer, AutoModelForCausalLM

# Transformers imports (now fallback for non-GGUF models)
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig  # type: ignore
from transformers import BitsAndBytesConfig  # type: ignore
import torch
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check for optional quantization support
try:
    import bitsandbytes as bnb
    quantization_available = True
    logger.info("βœ… BitsAndBytes quantization support available")
except ImportError:
    quantization_available = False
    logger.warning("⚠️ BitsAndBytes not available - 4-bit models will use standard loading")

# Pydantic models for multimodal content
class TextContent(BaseModel):
    # One text fragment inside a multimodal message body.
    type: str = Field(default="text", description="Content type")
    text: str = Field(..., description="Text content")

    @field_validator('type')
    @classmethod
    def validate_type(cls, v: str) -> str:
        # Only the literal discriminator "text" is accepted for this item kind.
        if v == "text":
            return v
        raise ValueError("Type must be 'text'")

class ImageContent(BaseModel):
    # One image reference (by URL) inside a multimodal message body.
    type: str = Field(default="image", description="Content type")
    url: str = Field(..., description="Image URL")

    @field_validator('type')
    @classmethod
    def validate_type(cls, v: str) -> str:
        # Only the literal discriminator "image" is accepted for this item kind.
        if v == "image":
            return v
        raise ValueError("Type must be 'image'")

# Pydantic models for OpenAI-compatible API
class ChatMessage(BaseModel):
    # A single conversation turn: who spoke (`role`) and what was said (`content`).
    # `content` is either a plain string or a list of text/image content items.
    role: str = Field(..., description="The role of the message author")
    content: Union[str, List[Union[TextContent, ImageContent]]] = Field(..., description="The content of the message - either string or list of content items")

    @field_validator('role')
    @classmethod
    def validate_role(cls, v: str) -> str:
        # Reject any author role outside the three supported ones.
        allowed_roles = ("system", "user", "assistant")
        if v in allowed_roles:
            return v
        raise ValueError("Role must be one of: system, user, assistant")

class ChatCompletionRequest(BaseModel):
    # Request body for the chat-completions endpoint.
    # `model` defaults to the AI_MODEL environment variable, read when the
    # request object is built (default_factory), not at import time.
    model: str = Field(default_factory=lambda: os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF"), description="The model to use for completion")
    # Conversation history, oldest first as supplied by the client.
    messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
    # Generation length, validated to the range 1..2048.
    max_tokens: Optional[int] = Field(default=512, ge=1, le=2048, description="Maximum tokens to generate")
    # Sampling temperature, validated to the range 0.0..2.0.
    temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0, description="Sampling temperature")
    # Client's streaming preference flag.
    stream: Optional[bool] = Field(default=False, description="Whether to stream responses")
    # Nucleus-sampling cutoff, validated to the range 0.0..1.0.
    top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0, description="Top-p sampling")

class ChatCompletionChoice(BaseModel):
    # One generated alternative inside a chat-completion response.
    index: int  # position of this choice in the response's `choices` list
    message: ChatMessage  # the generated message
    finish_reason: str  # why generation ended; value is supplied by the caller

class ChatCompletionResponse(BaseModel):
    # Response envelope for a non-streaming chat completion.
    id: str  # completion identifier chosen by the endpoint
    object: str = "chat.completion"  # fixed object tag for this response type
    created: int  # creation timestamp as an integer
    model: str  # model name echoed back to the client
    choices: List[ChatCompletionChoice]  # generated alternatives

class ChatCompletionChunk(BaseModel):
    # Envelope for one chunk of a streamed chat completion.
    id: str  # completion identifier
    object: str = "chat.completion.chunk"  # fixed object tag for chunk payloads
    created: int  # creation timestamp as an integer
    model: str  # model name
    choices: List[Dict[str, Any]]  # free-form per-choice payloads (not validated further here)

class HealthResponse(BaseModel):
    # Payload returned by the health-check endpoint.
    status: str  # health label, e.g. "healthy" / "unhealthy"
    model: str  # name of the currently configured model
    version: str  # service version string

class ModelInfo(BaseModel):
    # Description of one model in the model listing.
    id: str  # model identifier
    object: str = "model"  # fixed object tag for model entries
    created: int  # timestamp as an integer
    owned_by: str = "huggingface"  # owner label; defaults to "huggingface"

class ModelsResponse(BaseModel):
    # Envelope for the model listing.
    object: str = "list"  # fixed object tag for list payloads
    data: List[ModelInfo]  # the listed models

class CompletionRequest(BaseModel):
    # Request body for a plain (non-chat) prompt completion.
    prompt: str = Field(..., description="The prompt to complete")
    # Generation length, validated to the range 1..2048.
    max_tokens: Optional[int] = Field(default=512, ge=1, le=2048)
    # Sampling temperature, validated to the range 0.0..2.0.
    temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0)


# Module-level model state shared by the lifespan handler and the endpoints
# (covers both the GGUF backend and the transformers fallback).
# Model names are read from the environment once, at import time:
#   AI_MODEL     -> text model (defaults to the Gemma 3n GGUF repo)
#   VISION_MODEL -> image-captioning model used for image inputs
current_model = os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF")
vision_model = os.environ.get("VISION_MODEL", "Salesforce/blip-image-captioning-base")

# GGUF backend handle (llama-cpp-python); stays None until a GGUF model is loaded.
llm = None

# Transformers backend handles (fallback); stay None until loaded at startup.
tokenizer = None
model = None
# Image-to-text pipeline; None means image inputs are unavailable.
image_text_pipeline = None  # type: ignore

def get_quantization_config(model_name: str):
    """Return a 4-bit BitsAndBytesConfig for *model_name*, or None.

    None is returned when bitsandbytes is not importable or when the model
    name carries none of the markers ("4bit", "bnb", "unsloth") that this
    service treats as a hint for 4-bit loading (case-insensitive match).
    """
    if not quantization_available:
        return None

    lowered_name = model_name.lower()
    wants_4bit = any(marker in lowered_name for marker in ("4bit", "bnb", "unsloth"))
    if not wants_4bit:
        return None

    logger.info(f"πŸ”§ Configuring 4-bit quantization for {model_name}")
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

# Image processing utilities
async def download_image(url: str) -> Image.Image:
    """Download the image at *url* and open it with PIL.

    The blocking HTTP request and the image open run in a worker thread so the
    event loop is not stalled while waiting on the network.

    Args:
        url: Location of the image to fetch (10 second request timeout).

    Returns:
        The opened PIL image.

    Raises:
        HTTPException: 400 if the download or the image open fails for any reason.
    """
    # Local import: the file's import header does not bring `io` into scope.
    import io

    def _fetch_and_open() -> Image.Image:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Use the stdlib buffer type; `requests.compat` is an internal shim and
        # is not a documented source for BytesIO.
        return Image.open(io.BytesIO(response.content))

    try:
        return await asyncio.to_thread(_fetch_and_open)
    except Exception as e:
        logger.error(f"Failed to download image from {url}: {e}")
        raise HTTPException(status_code=400, detail=f"Failed to download image: {str(e)}") from e

def extract_text_and_images(content: Union[str, List[Any]]) -> tuple[str, List[str]]:
    """Split message content into its text and its image URLs.

    A plain string is returned unchanged with an empty URL list. For a list,
    items whose ``type`` is "text" contribute ``str(item.text)`` and items
    whose ``type`` is "image" contribute ``str(item.url)``; anything else is
    ignored. Text fragments are joined with single spaces.
    """
    if isinstance(content, str):
        return content, []

    texts: List[str] = []
    urls: List[str] = []

    for part in content:
        if not hasattr(part, 'type'):
            continue  # not a content item we understand
        kind = part.type
        if kind == "text" and hasattr(part, 'text'):
            texts.append(str(part.text))
        elif kind == "image" and hasattr(part, 'url'):
            urls.append(str(part.url))

    return " ".join(texts), urls

def has_images(messages: List[ChatMessage]) -> bool:
    """Return True if any list-form message content holds an item of type "image"."""
    return any(
        hasattr(part, 'type') and part.type == "image"
        for msg in messages
        if isinstance(msg.content, list)
        for part in msg.content
    )



@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the models at startup and release them at shutdown.

    Startup:
      1. If the configured model name contains "gguf" or "gemma-3n" and
         llama-cpp-python imported, try to load it with ``Llama.from_pretrained``.
      2. If that is not applicable or fails, load tokenizer and model with
         transformers. When that load fails with an error mentioning CUDA or
         bitsandbytes, progressively simpler CPU configurations are tried,
         ending with the ``microsoft/DialoGPT-medium`` fallback model; any
         other error is re-raised.
      3. Try to build the image-to-text pipeline; a failure here only disables
         image support.

    Shutdown: every model handle (including the GGUF one) is reset to None,
    also when the application exits through an exception.

    Raises:
        RuntimeError: if text-model initialization fails (chained to the cause).
    """
    global tokenizer, model, image_text_pipeline, llm, current_model
    logger.info("πŸš€ Starting AI Backend Service...")

    # Check if this is a GGUF model that should use llama-cpp-python
    is_gguf_model = "gguf" in current_model.lower() or "gemma-3n" in current_model.lower()

    try:
        if is_gguf_model and llama_cpp_available:
            logger.info(f"πŸ“₯ Loading GGUF model with llama-cpp-python: {current_model}")

            # Load Gemma 3n GGUF model using llama-cpp-python
            try:
                llm = Llama.from_pretrained(
                    repo_id=current_model,
                    filename="*Q4_K_M.gguf",  # Use exact filename pattern from available files
                    verbose=True,
                    # Gemma 3n specific settings
                    n_ctx=4096,  # Start with 4K context, can be increased to 32K
                    n_threads=4,  # Adjust based on CPU cores
                    n_gpu_layers=-1,  # Use all GPU layers if CUDA available
                    # Chat format for Gemma 3n
                    chat_format="gemma",  # Use built-in gemma format
                )
                logger.info("βœ… Successfully loaded Gemma 3n GGUF model")

            except Exception as gguf_error:
                logger.warning(f"⚠️ GGUF model loading failed: {gguf_error}")
                logger.info("πŸ’‘ Please ensure you have downloaded the GGUF model file locally")
                logger.info("πŸ’‘ Visit: https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF")

                # For now, we'll continue with transformers fallback
                is_gguf_model = False

        # Fallback to transformers if GGUF loading failed or not available
        if not is_gguf_model or not llama_cpp_available:
            logger.info(f"πŸ“₯ Loading model with transformers: {current_model}")

            # Load tokenizer and model directly from HuggingFace repo (standard transformers format)
            logger.info(f"πŸ“₯ Loading tokenizer from {current_model}...")
            tokenizer = AutoTokenizer.from_pretrained(current_model)

            # Get quantization config if needed
            quantization_config = get_quantization_config(current_model)

            logger.info(f"πŸ“₯ Loading model from {current_model}...")
            try:
                if quantization_config:
                    logger.info("πŸ”§ Attempting 4-bit quantization")
                    model = AutoModelForCausalLM.from_pretrained(
                        current_model,
                        quantization_config=quantization_config,
                        device_map="auto",
                        torch_dtype=torch.bfloat16,
                        low_cpu_mem_usage=True,
                        trust_remote_code=True,
                    )
                else:
                    logger.info("πŸ“₯ Using standard model loading with optimized settings")
                    model = AutoModelForCausalLM.from_pretrained(
                        current_model,
                        torch_dtype=torch.bfloat16,
                        device_map="auto",
                        low_cpu_mem_usage=True,
                        trust_remote_code=True,
                    )
            except Exception as quant_error:
                # Only errors that look quantization/CUDA related enter the
                # fallback chain; everything else propagates unchanged.
                error_text = str(quant_error)
                quantization_markers = (
                    "CUDA",
                    "bitsandbytes",
                    "PackageNotFoundError",
                    "No package metadata was found for bitsandbytes",
                )
                if not any(marker in error_text for marker in quantization_markers):
                    raise

                logger.warning(f"⚠️ Quantization failed - bitsandbytes not available or no CUDA: {quant_error}")
                logger.info("πŸ”„ Falling back to standard model loading, ignoring pre-quantized config")

                # For pre-quantized models, we need to load config first and remove quantization
                try:
                    logger.info("πŸ”§ Loading model config to remove quantization settings")

                    config = AutoConfig.from_pretrained(current_model, trust_remote_code=True)

                    # Remove any quantization configuration from the config
                    if hasattr(config, 'quantization_config'):
                        logger.info("🚫 Removing quantization_config from model config")
                        config.quantization_config = None

                    model = AutoModelForCausalLM.from_pretrained(
                        current_model,
                        config=config,
                        torch_dtype=torch.float16,
                        low_cpu_mem_usage=True,
                        trust_remote_code=True,
                        device_map="cpu",  # Force CPU when quantization fails
                    )
                except Exception as fallback_error:
                    logger.warning(f"⚠️ Config-based loading failed: {fallback_error}")
                    logger.info("πŸ”„ Trying standard loading without quantization config")
                    try:
                        model = AutoModelForCausalLM.from_pretrained(
                            current_model,
                            torch_dtype=torch.float16,
                            low_cpu_mem_usage=True,
                            trust_remote_code=True,
                            device_map="cpu",
                        )
                    except Exception as standard_error:
                        logger.warning(f"⚠️ Standard loading also failed: {standard_error}")
                        logger.info("πŸ”„ Trying with minimal configuration - bypassing all quantization")
                        # Ultimate fallback: Load without any custom config
                        try:
                            model = AutoModelForCausalLM.from_pretrained(
                                current_model,
                                trust_remote_code=True,
                            )
                        except Exception as minimal_error:
                            logger.warning(f"⚠️ Minimal loading also failed: {minimal_error}")
                            logger.info("πŸ”„ Final fallback: Using deployment-friendly default model")
                            # If this specific model absolutely cannot load, fallback to a reliable alternative
                            fallback_model = "microsoft/DialoGPT-medium"
                            logger.info(f"πŸ“₯ Loading fallback model: {fallback_model}")
                            tokenizer = AutoTokenizer.from_pretrained(fallback_model)
                            model = AutoModelForCausalLM.from_pretrained(fallback_model)
                            logger.info(f"βœ… Successfully loaded fallback model: {fallback_model}")
                            # Update current_model to reflect what we actually loaded
                            current_model = fallback_model

        logger.info(f"βœ… Successfully loaded model and tokenizer: {current_model}")

        # Load image pipeline for multimodal support
        try:
            logger.info(f"πŸ–ΌοΈ Initializing image captioning pipeline with model: {vision_model}")
            image_text_pipeline = pipeline("image-to-text", model=vision_model)
            logger.info("βœ… Image captioning pipeline loaded successfully")
        except Exception as e:
            logger.warning(f"⚠️ Could not load image captioning pipeline: {e}")
            image_text_pipeline = None

    except Exception as e:
        logger.error(f"❌ Failed to initialize model: {e}")
        # Chain the cause so the original traceback is kept.
        raise RuntimeError(f"Service initialization failed: {e}") from e

    try:
        yield
    finally:
        # Drop every model reference, including the GGUF handle, so the
        # memory can be reclaimed even if the app stops because of an error.
        logger.info("πŸ”„ Shutting down AI Backend Service...")
        llm = None
        tokenizer = None
        model = None
        image_text_pipeline = None

# Initialize FastAPI app
# The application object; model loading and cleanup are delegated to `lifespan`.
# Title/description name the Gemma 3n model this service is documented to
# serve (they previously still referred to a Mistral Nemo model).
app = FastAPI(
    title="AI Backend Service - Gemma 3n",
    description="OpenAI-compatible chat completion API powered by unsloth/gemma-3n-E4B-it-GGUF",
    version="1.0.0",
    lifespan=lifespan
)

# Add CORS middleware
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive — restrict `allow_origins` before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def ensure_model_ready():
    """Raise HTTP 503 unless a GGUF model or a tokenizer+model pair is loaded."""
    gguf_ready = llm is not None
    transformers_ready = tokenizer is not None and model is not None
    if gguf_ready or transformers_ready:
        return
    raise HTTPException(status_code=503, detail="Service not ready - no model initialized (neither GGUF nor transformers)")

def convert_messages_to_prompt(messages: List[ChatMessage]) -> str:
    """Flatten chat messages into one "Speaker: text" transcript.

    Each message becomes a line prefixed with System/Human/Assistant; list-form
    content contributes only its text parts. A trailing "Assistant:" line is
    appended as the cue for the model to continue.
    """
    speaker_labels = {"system": "System", "user": "Human", "assistant": "Assistant"}
    lines: List[str] = []

    for msg in messages:
        # Reduce list-form content to its text; plain strings pass through.
        body = msg.content
        if not isinstance(body, str):
            body = extract_text_and_images(body)[0]

        label = speaker_labels.get(msg.role)
        if label is not None:
            lines.append(f"{label}: {body}")

    # Add assistant prompt to continue
    lines.append("Assistant:")
    return "\n".join(lines)

async def generate_multimodal_response(
    messages: List[ChatMessage],
    request: ChatCompletionRequest
) -> str:
    """Answer a conversation that contains an image by captioning that image.

    The most recent user message that actually carries an image is selected,
    its first image URL is run through the image-to-text pipeline in a worker
    thread, and the caption is combined with the user's text (if any).

    Args:
        messages: Conversation turns; at least one user turn should have
            list-form content with an image item.
        request: The originating request (currently not consulted here).

    Returns:
        The reply text. Pipeline failures produce an apologetic reply instead
        of an error.

    Raises:
        HTTPException: 503 if the image pipeline is not initialized; 400 if no
            user message with list content, or none with an image, is found.
    """
    if not image_text_pipeline:
        raise HTTPException(status_code=503, detail="Image processing not available - pipeline not initialized")

    try:
        # Walk backwards to the newest user message that really has an image;
        # a later text-only list message must not hide an earlier image.
        saw_list_message = False
        text_content = ""
        image_urls: List[str] = []
        for message in reversed(messages):
            if message.role == "user" and isinstance(message.content, list):
                saw_list_message = True
                text_content, image_urls = extract_text_and_images(message.content)
                if image_urls:
                    break

        if not saw_list_message:
            raise HTTPException(status_code=400, detail="No user message with images found")

        if not image_urls:
            raise HTTPException(status_code=400, detail="No images found in the message")

        # Use the first image for now (could be extended to handle multiple images)
        image_url = image_urls[0]

        # Generate response using the image-to-text pipeline
        logger.info(f"πŸ–ΌοΈ Processing image: {image_url}")
        try:
            # The pipeline accepts the image URL directly; run it off the event loop.
            result = await asyncio.to_thread(image_text_pipeline, image_url)  # type: ignore

            # Handle response format from image-to-text pipeline
            if result and hasattr(result, '__len__') and len(result) > 0:  # type: ignore
                first_result = result[0]  # type: ignore
                if hasattr(first_result, 'get'):
                    generated_text = first_result.get('generated_text', f'I can see an image at {image_url}.')  # type: ignore
                else:
                    generated_text = str(first_result)

                # Combine with user's text question if provided
                if text_content:
                    response = f"Looking at this image, I can see: {generated_text}. "
                    if "what" in text_content.lower() or "?" in text_content:
                        response += f"Regarding your question '{text_content}': Based on what I can see, this appears to be {generated_text.lower()}."
                    else:
                        response += f"You mentioned: {text_content}"
                    return response
                else:
                    return f"I can see: {generated_text}"
            else:
                return f"I can see there's an image at {image_url}, but cannot process it right now."

        except Exception as pipeline_error:
            logger.warning(f"Pipeline error: {pipeline_error}")
            return f"I can see there's an image at {image_url}. The image appears to contain visual content that I'm having trouble processing right now."

    except HTTPException:
        # Client errors raised above must reach the caller as HTTP errors
        # rather than being converted into a normal-looking reply.
        raise
    except Exception as e:
        logger.error(f"Error in multimodal generation: {e}")
        return f"I'm having trouble processing the image. Error: {str(e)}"


def generate_response_local(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
    """Produce a reply with whichever local backend is loaded.

    The GGUF backend is used when ``llm`` is set; otherwise the transformers
    backend is used. Readiness is checked first (HTTP 503 if nothing is
    loaded); any error during generation is logged and replaced by a fixed
    apology string.
    """
    ensure_model_ready()

    try:
        if llm is None:
            # Fallback to transformers model
            logger.info("πŸ€— Generating response using transformers model")
            backend = generate_response_transformers
        else:
            logger.info("🦾 Generating response using Gemma 3n GGUF model")
            backend = generate_response_gguf
        return backend(messages, max_tokens, temperature, top_p)
    except Exception as exc:
        logger.error(f"Local generation failed: {exc}")
        return "I apologize, but I'm having trouble generating a response right now. Please try again."

def generate_response_gguf(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
    """Generate a reply with the GGUF model through llama-cpp-python.

    Args:
        messages: Conversation turns. List-form content is reduced to its text
            parts before being handed to the chat-completion call, matching
            what the transformers backend does.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.

    Returns:
        The stripped reply text (empty if the model returned no content), or a
        fixed apology string if generation raised.
    """
    stop_tokens = ["<end_of_turn>", "<eos>", "</s>"]  # Gemma 3n stop tokens

    try:
        # Use the chat completion method if available
        if hasattr(llm, 'create_chat_completion'):
            # Plain role/text dicts: content items (pydantic objects) are not
            # something the llama-cpp chat API can consume.
            messages_dict = [
                {
                    "role": msg.role,
                    "content": msg.content
                    if isinstance(msg.content, str)
                    else extract_text_and_images(msg.content)[0],
                }
                for msg in messages
            ]

            response = llm.create_chat_completion(
                messages=messages_dict,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=64,  # Add top_k for better Gemma 3n performance
                stop=stop_tokens,
            )

            # Guard against a missing/None content field before stripping.
            reply = response['choices'][0]['message']['content']
            return (reply or "").strip()

        # Fallback to direct prompt completion
        prompt = convert_messages_to_gemma_prompt(messages)

        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=64,
            stop=stop_tokens,
            echo=False
        )

        return (response['choices'][0]['text'] or "").strip()

    except Exception as e:
        logger.error(f"GGUF generation failed: {e}")
        return "I apologize, but I'm having trouble generating a response right now. Please try again."

def convert_messages_to_gemma_prompt(messages: List[ChatMessage]) -> str:
    """Render chat messages as a Gemma-style turn-delimited prompt.

    Every message becomes a ``<start_of_turn>ROLE ... <end_of_turn>`` section
    (assistant turns use the role name ``model``); messages with any other
    role are skipped. List-form content is reduced to its text parts so that
    no Python object repr ends up in the prompt. The prompt ends with an open
    ``model`` turn for the reply.
    """
    # NOTE(review): a separate "system" turn and a literal "<bos>" prefix may
    # not match what the model/runtime expects — confirm against the Gemma
    # prompt-format documentation.
    turn_names = {"system": "system", "user": "user", "assistant": "model"}
    prompt_parts = ["<bos>"]

    for message in messages:
        turn = turn_names.get(message.role)
        if turn is None:
            continue

        content = message.content
        if not isinstance(content, str):
            # Keep only the text of multimodal content.
            content = " ".join(
                str(item.text)
                for item in content
                if getattr(item, "type", None) == "text" and hasattr(item, "text")
            )

        prompt_parts.append(f"<start_of_turn>{turn}\n{content}<end_of_turn>")

    # Add the start for model response
    prompt_parts.append("<start_of_turn>model\n")

    return "\n".join(prompt_parts)

def generate_response_transformers(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
    """Generate a reply with the transformers model using its chat template.

    Args:
        messages: Conversation turns; list-form content is reduced to its text.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature; a value of 0 or less selects
            deterministic (non-sampling) decoding.
        top_p: Nucleus-sampling cutoff; only applied when strictly between 0 and 1.

    Returns:
        The stripped text of the newly generated tokens, or a fixed apology
        string if generation raised.
    """
    try:
        # Convert messages to HuggingFace format for chat template
        chat_messages = [
            {
                "role": m.role,
                "content": m.content if isinstance(m.content, str) else extract_text_and_images(m.content)[0],
            }
            for m in messages
        ]

        inputs = tokenizer.apply_chat_template(
            chat_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Move inputs to model device
        inputs = inputs.to(model.device)

        # Honour the caller's sampling settings instead of silently dropping them.
        generation_kwargs: Dict[str, Any] = {"max_new_tokens": max_tokens}
        if temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = temperature
            if 0.0 < top_p < 1.0:
                generation_kwargs["top_p"] = top_p
        else:
            # Temperature 0 is not a valid sampling temperature; decode greedily.
            generation_kwargs["do_sample"] = False

        outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens (exclude input)
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return generated_text.strip()

    except Exception as e:
        logger.error(f"Transformers generation failed: {e}")
        return "I apologize, but I'm having trouble generating a response right now. Please try again."


@app.get("/", response_class=JSONResponse)
async def root() -> Dict[str, Any]:
    """Return basic service information and the main endpoint paths."""
    return {
        # Name the model that is actually configured instead of a stale,
        # hard-coded model name.
        "message": f"AI Backend Service is running with {current_model}!",
        "model": current_model,
        "version": "1.0.0",
        "endpoints": {
            "health": "/health",
            "models": "/v1/models",
            "chat_completions": "/v1/chat/completions"
        }
    }

@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report whether a text model is loaded.

    The service counts as healthy when either backend is ready: the GGUF
    model, or the transformers tokenizer and model together. This is the same
    readiness rule ``ensure_model_ready`` applies before generating.
    """
    gguf_ready = llm is not None
    transformers_ready = tokenizer is not None and model is not None
    return HealthResponse(
        status="healthy" if (gguf_ready or transformers_ready) else "unhealthy",
        model=current_model,
        version="1.0.0"
    )

@app.get("/v1/models", response_model=ModelsResponse)
async def list_models():
    """Return the served models: the text model, plus the vision model when its pipeline is loaded."""
    served_ids = [current_model]

    # Add vision model if available
    if image_text_pipeline:
        served_ids.append(vision_model)

    entries = [
        ModelInfo(id=model_id, created=int(time.time()), owned_by="huggingface")
        for model_id in served_ids
    ]
    return ModelsResponse(data=entries)

        
        # ...existing code...


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(
    request: ChatCompletionRequest
) -> ChatCompletionResponse:
    """Create a chat completion (OpenAI-compatible) with multimodal support.

    Requests whose messages contain images are routed to
    ``generate_multimodal_response``; all others are generated by
    ``generate_response_local`` in a worker thread so the event loop is not
    blocked.

    Args:
        request: The chat completion request.

    Returns:
        A ``ChatCompletionResponse`` with a single assistant choice.

    Raises:
        HTTPException: 400 when ``request.messages`` is empty, 503 when the
            request contains images but no image pipeline is loaded, and 500
            for any unexpected failure during generation.
    """
    try:
        if not request.messages:
            raise HTTPException(status_code=400, detail="Messages cannot be empty")

        if has_images(request.messages):
            if not image_text_pipeline:
                raise HTTPException(status_code=503, detail="Image processing not available")
            response_text = await generate_multimodal_response(request.messages, request)
        else:
            logger.info(f"Generating local response for messages: {request.messages}")
            response_text = await asyncio.to_thread(
                generate_response_local,
                request.messages,
                request.max_tokens or 512,
                request.temperature or 0.7,
                request.top_p or 0.95
            )

        response_text = response_text.strip() if response_text else "No response generated."

        # Take the timestamp once so the id and the created field always agree.
        created_at = int(time.time())
        return ChatCompletionResponse(
            id=f"chatcmpl-{created_at}",
            created=created_at,
            model=request.model,
            choices=[ChatCompletionChoice(
                index=0,
                message=ChatMessage(role="assistant", content=response_text),
                finish_reason="stop"
            )]
        )
    except HTTPException:
        # Deliberate client/service errors (400, 503) must keep their status
        # code instead of being rewrapped as a 500 by the handler below.
        raise
    except Exception as e:
        logger.error(f"Error in chat completion: {e}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") from e



@app.post("/v1/completions")
async def create_completion(
    request: CompletionRequest
) -> Dict[str, Any]:
    """Create a text completion (OpenAI-compatible).

    The prompt is wrapped in a single user message and generated by
    ``generate_response_local`` in a worker thread.

    Args:
        request: The completion request.

    Returns:
        A dictionary in the ``text_completion`` shape with a single choice.

    Raises:
        HTTPException: 400 when the prompt is empty and 500 for any
            unexpected failure. An ``HTTPException`` raised by
            ``ensure_model_ready()`` (defined elsewhere in this module) is
            propagated with its own status code.
    """
    try:
        if not request.prompt:
            raise HTTPException(status_code=400, detail="Prompt cannot be empty")
        ensure_model_ready()

        # Use the prompt as a single user message
        messages = [ChatMessage(role="user", content=request.prompt)]
        response_text = await asyncio.to_thread(
            generate_response_local,
            messages,
            request.max_tokens or 512,
            request.temperature or 0.7,
            0.95
        )

        # Take the timestamp once so the id and the created field always agree.
        created_at = int(time.time())
        return {
            "id": f"cmpl-{created_at}",
            "object": "text_completion",
            "created": created_at,
            "model": current_model,
            "choices": [{
                "text": response_text,
                "index": 0,
                "finish_reason": "stop"
            }]
        }
    except HTTPException:
        # Keep intentional status codes (e.g. the 400 above) instead of
        # letting the generic handler below rewrap them as a 500.
        raise
    except Exception as e:
        logger.error(f"Error in completion: {e}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") from e

@app.post("/api/response")
async def api_response(request: Request) -> JSONResponse:
    """Echo the ``message`` field of a JSON request body.

    Args:
        request: The incoming request; its body is expected to be a JSON
            object, optionally carrying a ``"message"`` key.

    Returns:
        A ``JSONResponse`` with the received message and an echo string.
        When the key is absent, ``"No message provided"`` is used.

    Raises:
        HTTPException: 400 when the body is not valid JSON or is not a JSON
            object, and 500 for any other failure while reading the body.
    """
    try:
        data = await request.json()
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses: a malformed body is a client error, not a server fault.
        logger.error(f"Error processing API response: {e}")
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from e
    except Exception as e:
        logger.error(f"Error processing API response: {e}")
        raise HTTPException(status_code=500, detail="Internal server error") from e

    # Valid JSON may still be a list, string or number, which has no .get().
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    message = data.get("message", "No message provided")
    return JSONResponse(content={
        "status": "success",
        "received_message": message,
        "response_message": f"You sent: {message}"
    })

# Script entry point, kept at the end of the module for proper initialization.
if __name__ == "__main__":
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
    }
    # The app is given as an import string rather than as an object.
    uvicorn.run("backend_service:app", **server_options)