"""FastAPI service exposing the Guanaco-7B-Uncensored AWQ-quantized chat model.

Run directly (``python app.py``) to serve on 0.0.0.0:7860.
"""

from fastapi import FastAPI
import uvicorn
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name = "TheBloke/Guanaco-7B-Uncensored-AWQ"

# Model and tokenizer are loaded once at import time; loading is expensive
# and must not happen per-request.
model = AutoAWQForCausalLM.from_quantized(
    model_name,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)

app = FastAPI()


@app.get("/")
def greet_json():
    """Trivial health-check endpoint."""
    return {"Hello": "World!"}


@app.get("/message")
def message(input: str) -> str:
    """Generate a chat completion for the ``input`` query parameter.

    Declared as a plain ``def`` (not ``async def``) so FastAPI runs it in a
    worker thread: ``model.generate`` is a long blocking call and, inside an
    ``async def`` endpoint, would stall the event loop for every concurrent
    request.

    Note: the parameter is named ``input`` (shadowing the builtin) because
    the name is the public query-parameter key; renaming it would break the
    HTTP interface.
    """
    # Guanaco prompt template expected by this fine-tune.
    prompt = f'''### Human: {input}
### Assistant:
'''
    # NOTE(review): inputs are moved to CPU while AWQ models are usually
    # loaded on GPU — confirm device placement matches the deployment.
    inputs = tokenizer(
        prompt, return_tensors="pt", padding=True, truncation=True
    ).input_ids.cpu()
    output = model.generate(
        inputs,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        max_new_tokens=512,
    )
    # Decoded text includes the prompt followed by the assistant's reply.
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)