abdullahalioo committed
Commit 463f46a · verified
1 Parent(s): 6a547e4

Update main.py

Files changed (1)
  1. main.py +11 -19
main.py CHANGED
@@ -7,23 +7,23 @@ import torch
 import os
 import asyncio

-# ✅ Set a safe and writable HF cache directory
-os.environ["HF_HOME"] = "./hf_home"
+# ✅ Use writable temp dir for Hugging Face cache
+os.environ["HF_HOME"] = "/tmp/hf_home"
 os.makedirs(os.environ["HF_HOME"], exist_ok=True)

-# ✅ Model and tokenizer (only loaded once)
+# ✅ Load model and tokenizer
 model_name = "Qwen/Qwen2.5-0.5B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

-# ✅ Set device (use GPU if available)
+# ✅ Use CUDA if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)

-# ✅ FastAPI app
+# ✅ Initialize FastAPI
 app = FastAPI()

-# ✅ CORS settings
+# ✅ Enable CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -32,25 +32,21 @@ app.add_middleware(
     allow_headers=["*"],
 )

-# ✅ Request schema
+# ✅ Input data model
 class Question(BaseModel):
     question: str

-# ✅ System prompt
+# ✅ Instructional system prompt
 SYSTEM_PROMPT = "You are Orion, an intelligent AI assistant created by Abdullah Ali, a 13-year-old from Lahore. Respond kindly and wisely."

-# ✅ Streaming generator
+# ✅ Streaming response generator
 async def generate_response_chunks(prompt: str):
     qwen_prompt = (
         f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
         f"<|im_start|>user\n{prompt}<|im_end|>\n"
         f"<|im_start|>assistant\n"
     )
-
-    # Tokenize prompt
     inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
-
-    # Generate output
     outputs = model.generate(
         **inputs,
         max_new_tokens=256,
@@ -59,17 +55,13 @@ async def generate_response_chunks(prompt: str):
         top_p=0.9,
         pad_token_id=tokenizer.eos_token_id
     )
-
-    # Decode output
     full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
     reply = full_output.split("<|im_start|>assistant\n")[-1].strip()
-
-    # Yield chunks word by word (simulating stream)
     for word in reply.split():
         yield word + " "
-        await asyncio.sleep(0.01)  # slight delay for streaming effect
+        await asyncio.sleep(0.01)

-# ✅ POST endpoint
+# ✅ API route
 @app.post("/ask")
 async def ask(question: Question):
     return StreamingResponse(generate_response_chunks(question.question), media_type="text/plain")
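For reference, a minimal client sketch (not part of the commit) showing how the streaming /ask endpoint can be consumed; the host, port, and uvicorn launch command are assumptions, since the diff does not show how the app is served.

# Hypothetical client for the /ask endpoint; assumes the app is running
# locally, e.g. via `uvicorn main:app --port 8000` (URL and port are illustrative).
import requests

response = requests.post(
    "http://localhost:8000/ask",
    json={"question": "Who created you?"},
    stream=True,  # read the plain-text body as it is yielded by the server
)
response.encoding = "utf-8"
for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)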