Spaces:

AlphaPhoenix
/

MATRIX

Sleeping

File size: 1,794 Bytes

dea3a07
2fc7e1b
861971b
a751c84
861971b
2fc7e1b
dea3a07
2fc7e1b
 
e604a26
861971b
 
af0df21
2fc7e1b
 
af0df21
a751c84
 
 
 
 
 
 
 
2fc7e1b
 
 
 
 
32dbfef
a751c84
2fc7e1b
 
 
 
 
e604a26
861971b
 
 
 
 
 
2fc7e1b
861971b
 
 
af0df21
2fc7e1b
861971b
2fc7e1b
 
861971b

import os
import torch
from fastapi import FastAPI
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from pydantic import BaseModel
import logging

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application instance; the /generate route below is registered on it.
app = FastAPI()

# Hub identifier of the checkpoint loaded at import time.
model_name = "google/gemma-2-2b-it"
try:
    logger.info(f"Loading model: {model_name}")
    # The access token comes from the HF_TOKEN environment variable
    # (os.getenv yields None when it is unset).
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=os.getenv("HF_TOKEN"),
    )

    use_gpu = torch.cuda.is_available()
    logger.info(f"GPU available: {use_gpu}")

    # A 4-bit NF4 quantization config is built only when CUDA is available;
    # otherwise no quantization config is passed to the model loader.
    if use_gpu:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
    else:
        quantization_config = None

    # Keyword arguments for the model loader, gathered in one place.
    _model_load_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "token": os.getenv("HF_TOKEN"),
        "low_cpu_mem_usage": True,
        "quantization_config": quantization_config,
    }
    model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_kwargs)
    logger.info("Model loaded successfully")
except Exception as e:
    # Record the failure, then let it propagate so the module import fails.
    logger.error(f"Model load error: {e}")
    raise

class TextInput(BaseModel):
    """Request body accepted by the ``/generate`` endpoint."""

    # Prompt text handed to the tokenizer.
    text: str
    # Forwarded unchanged as ``max_length`` to ``model.generate``; defaults to 50.
    # NOTE(review): presumably this bounds prompt + generated tokens together,
    # not just the new tokens — confirm against the Transformers generate docs.
    max_length: int = 50

@app.post("/generate")
async def generate_text(input: TextInput):
    """Generate text from ``input.text`` using the module-level model.

    The prompt is tokenized, moved to the device the model resides on, passed
    to ``model.generate`` with ``max_length=input.max_length``, and the first
    returned sequence is decoded with special tokens skipped.

    Args:
        input: Request body carrying the prompt and the ``max_length`` value.

    Returns:
        ``{"generated_text": <decoded text>}`` on success, or
        ``{"error": <exception message>}`` if any step raises. Exceptions are
        logged with their traceback and are not propagated.
    """
    try:
        logger.info("Generating text for input: %s", input.text)
        # Send the inputs to the model's own device instead of re-probing
        # CUDA: the model is loaded with device_map="auto", so its reported
        # device is the authoritative placement for the input tensors.
        inputs = tokenizer(input.text, return_tensors="pt").to(model.device)
        # NOTE(review): generate() is called synchronously inside this async
        # handler, so the event loop is blocked for the whole generation.
        outputs = model.generate(**inputs, max_length=input.max_length)
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        logger.info("Generated text: %s", result)
        return {"generated_text": result}
    except Exception as e:
        # logger.exception keeps the traceback, which is otherwise lost
        # because the error is converted into a response body here.
        logger.exception("Generation error: %s", e)
        return {"error": str(e)}