from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load Qwen model and tokenizer (once, at import time)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Set device
device = torch.device("cpu")  # Or "cuda" if a GPU is available
model.to(device)

# FastAPI app
app = FastAPI()

# CORS settings
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request body model
class Question(BaseModel):
    question: str

# System prompt (your custom instructions)
SYSTEM_PROMPT = (
    "You are Orion, an intelligent AI assistant created by Abdullah Ali, "
    "a 13-year-old from Lahore. Respond kindly and wisely."
)

# Chat response generator
async def generate_response_chunks(prompt: str):
    # Build the prompt in Qwen's ChatML format
    qwen_prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    # Tokenize input
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)

    # Generate the full response (blocking; the stream below only chunks the finished text)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Splitting the full decoded text on
    # "<|im_start|>assistant" would not work here, because skip_special_tokens=True
    # strips those markers, so the split would return the prompt as well.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Yield the reply word by word
    for chunk in reply.split():
        yield chunk + " "

@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
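
# --- Usage sketch ---
# A minimal way to try the /ask endpoint, assuming this file is saved as main.py,
# the server is started with `uvicorn main:app` and listens on the default
# http://127.0.0.1:8000 (adjust host/port/filename to your setup):
#
#   import requests
#
#   with requests.post(
#       "http://127.0.0.1:8000/ask",
#       json={"question": "Who created you?"},
#       stream=True,
#   ) as resp:
#       for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="", flush=True)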