from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import asyncio

# βœ… Use writable temp dir for Hugging Face cache
os.environ["HF_HOME"] = "/tmp/hf_home"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

# βœ… Load model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# βœ… Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# βœ… Initialize FastAPI
app = FastAPI()

# βœ… Enable CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# βœ… Input data model
class Question(BaseModel):
    question: str

# βœ… Instructional system prompt
SYSTEM_PROMPT = "You are Orion, an intelligent AI assistant created by Abdullah Ali, a 13-year-old from Lahore. Respond kindly and wisely."

# βœ… Response generator: generates the full reply, then streams it back word by word
async def generate_response_chunks(prompt: str):
    qwen_prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens; skip_special_tokens strips the
    # <|im_start|> markers, so splitting the full decode on them is unreliable.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    for word in reply.split():
        yield word + " "
        await asyncio.sleep(0.01)
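
# A possible alternative (a sketch, not part of the original app): stream tokens as
# they are produced using transformers' TextIteratorStreamer instead of generating
# the whole reply first. model.generate runs in a background thread so the blocking
# call does not stall the event loop for the entire generation.
async def generate_response_chunks_streamed(prompt: str):
    from threading import Thread
    from transformers import TextIteratorStreamer

    qwen_prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        ),
        daemon=True,
    ).start()
    # Iterating the streamer blocks briefly per token; acceptable for a small demo.
    for text in streamer:
        yield text
        await asyncio.sleep(0)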

# βœ… API route
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(generate_response_chunks(question.question), media_type="text/plain")
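
# Example request once the server is running (port 7860 is an assumption, common for
# Hugging Face Spaces; adjust it to wherever uvicorn actually listens):
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello, who are you?"}'

# Minimal local entry point (a sketch; the hosting environment may start the server
# differently, e.g. via its own uvicorn command).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)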