# hostserver2 / main.py
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load Qwen model and tokenizer (once)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
# Set device
device = torch.device("cpu") # Or "cuda" if using GPU
model.to(device)
# FastAPI app
app = FastAPI()
# CORS settings
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request body model
class Question(BaseModel):
    question: str
# System prompt (your custom instructions)
SYSTEM_PROMPT = "You are Orion, an intelligent AI assistant created by Abdullah Ali, a 13-year-old from Lahore. Respond kindly and wisely."
# Chat response generator
async def generate_response_chunks(prompt: str):
    # Build the prompt using Qwen's ChatML format
    qwen_prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    # Tokenize input
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)

    # Generate the full response in one call
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Decoding the whole sequence with
    # skip_special_tokens=True strips the <|im_start|> markers, so splitting on
    # them would return the system and user prompt text along with the reply.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Yield the reply word by word so the client receives a streamed response
    for chunk in reply.split():
        yield chunk + " "
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
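
# Optional local entry point: a minimal sketch, assuming the app is served with
# uvicorn. Port 7860 follows the usual Hugging Face Spaces convention and is an
# assumption here; adjust host/port as needed. Example request once running:
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Who created you?"}'
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)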