"""Minimal FastAPI service exposing text generation from a 4-bit-quantized
LLaMA-3 model.

Endpoints:
    GET /          -> health-check JSON message
    GET /generate  -> {"result": <generated text>} for the `text` query param
"""

from fastapi import FastAPI
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
import torch

app = FastAPI()

# 4-bit NF4 quantization with fp16 compute keeps the 8B model small enough
# for a single GPU; device_map="auto" lets accelerate place the weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

MODEL_ID = "VolkanSimsir/LLaMA-3-8B-GRPO-math-tr"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

# BUG FIX: the original code called `pipe(text)` in /generate but never
# defined `pipe` (the `pipeline` import was unused), so every request
# raised NameError. Build the pipeline once at startup; a tokenizer must
# be supplied explicitly because we pass a pre-loaded model object.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


@app.get("/")
def home():
    """Health-check endpoint."""
    return {"message": "Hello World"}


@app.get("/generate")
def generate(text: str):
    """Generate a completion for *text* and return it as JSON.

    Args:
        text: The prompt string, taken from the `text` query parameter.

    Returns:
        dict: {"result": <generated_text>} from the first pipeline output.
    """
    output = pipe(text)
    return {"result": output[0]["generated_text"]}