"""FastAPI inference server wrapping a Hugging Face causal language model.

Exposes:
- GET "/"        — health check.
- GET "/message" — generate an assistant reply for a user-supplied prompt.
"""

from fastapi import FastAPI
import torch
import uvicorn
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Tap-M/Luna-AI-Llama2-Uncensored"

# Load the model once at startup. device_map="auto" distributes layers across
# the available GPU/CPU; offload_folder holds layers that do not fit in memory.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    offload_folder="./offload",
    trust_remote_code=True,  # required by some custom model repos
)
model.eval()  # inference only — disables dropout and similar training modes

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Llama-family tokenizers ship without a pad token; reuse EOS so that
# padding/truncation in the request handler works. Guarded so an existing
# pad token is never clobbered.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

app = FastAPI()


@app.get("/")
def greet_json():
    """Health-check endpoint."""
    return {"Hello": "World!"}


@app.get("/message")
def message(input: str):  # `input` shadows the builtin but is the public query-param name
    """Generate an assistant reply for the user text passed as ?input=...

    Declared as a plain ``def`` (not ``async def``) so FastAPI runs the
    blocking ``model.generate`` call in its threadpool instead of stalling
    the event loop for every concurrent request.
    """
    prompt = "USER:" + input + "\nASSISTANT:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    # Move tensors to wherever device_map placed the first model shard;
    # a no-op on CPU-only deployments.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.inference_mode():  # no autograd bookkeeping during generation
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            pad_token_id=tokenizer.pad_token_id,  # silence the missing-pad warning
        )
    # Decode only the newly generated tokens so the prompt is not echoed
    # back to the client along with the answer.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)