from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import asyncio

# βœ… Set a safe and writable HF cache directory
os.environ["HF_HOME"] = "./hf_home"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

# βœ… Model and tokenizer (only loaded once)
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# βœ… Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# βœ… FastAPI app
app = FastAPI()

# βœ… CORS settings (open to all origins for demo purposes; restrict allow_origins in production)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# βœ… Request schema
class Question(BaseModel):
    question: str

# βœ… System prompt
SYSTEM_PROMPT = "You are Orion, an intelligent AI assistant created by Abdullah Ali, a 13-year-old from Lahore. Respond kindly and wisely."

# βœ… Streaming generator
async def generate_response_chunks(prompt: str):
    # Build a ChatML-style prompt for the Qwen instruct model
    qwen_prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    # Tokenize prompt
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)

    # Generate output in a worker thread so the blocking call does not stall the event loop
    outputs = await asyncio.to_thread(
        model.generate,
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. skip_special_tokens strips the
    # <|im_start|>/<|im_end|> markers, so splitting the decoded text on
    # "<|im_start|>assistant" would never match; slicing off the prompt tokens
    # isolates the assistant reply reliably.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(generated, skip_special_tokens=True).strip()

    # Yield the reply word by word (simulated streaming)
    for word in reply.split():
        yield word + " "
        await asyncio.sleep(0.01)  # slight delay for a streaming effect

# βœ… POST endpoint
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(generate_response_chunks(question.question), media_type="text/plain")
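
# The block below is an illustrative launch helper, not part of the original API code:
# it assumes uvicorn is installed and that port 7860 (the usual Hugging Face Spaces port)
# is free; adjust host/port for your environment.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request once the server is running (hypothetical local URL):
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Who are you?"}'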