abdullahalioo committed
Commit 2ba12d8 · verified · 1 Parent(s): f7b7ed5

Update main.py

Files changed (1)
main.py +48 -27
main.py CHANGED
@@ -7,33 +7,38 @@ import torch
 import os
 import asyncio
 
-# ✅ Set all cache directories to a writable location
+# Set cache directories
 cache_dir = "/tmp/hf_home"
 os.environ["HF_HOME"] = cache_dir
 os.environ["TRANSFORMERS_CACHE"] = cache_dir
 os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
 
-# ✅ Create cache directory with proper permissions
+# Create cache directory with proper permissions
 os.makedirs(cache_dir, exist_ok=True)
-os.chmod(cache_dir, 0o777)  # Make writable by all
+os.chmod(cache_dir, 0o777)
 
-# ✅ Load model and tokenizer
+# Load model and tokenizer
 model_name = "Qwen/Qwen2.5-0.5B-Instruct"
-try:
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=cache_dir)
-    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, cache_dir=cache_dir)
-except Exception as e:
-    print(f"Error loading model: {e}")
-    raise
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    trust_remote_code=True,
+    cache_dir=cache_dir
+)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    trust_remote_code=True,
+    cache_dir=cache_dir,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+)
 
-# ✅ Use CUDA if available
+# Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
-# ✅ Initialize FastAPI
+# Initialize FastAPI
 app = FastAPI()
 
-# ✅ Enable CORS
+# Enable CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -42,36 +47,52 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# ✅ Input data model
+# Input model
 class Question(BaseModel):
     question: str
 
-# ✅ Instructional system prompt
+# System prompt
 SYSTEM_PROMPT = "You are Orion, an intelligent AI assistant created by Abdullah Ali, a 13-year-old from Lahore. Respond kindly and wisely."
 
-# ✅ Streaming response generator
 async def generate_response_chunks(prompt: str):
-    qwen_prompt = (
-        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
-        f"<|im_start|>user\n{prompt}<|im_end|>\n"
-        f"<|im_start|>assistant\n"
+    # Create the chat template
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": prompt}
+    ]
+
+    # Apply chat template
+    qwen_prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
+
+    # Tokenize and generate
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
     outputs = model.generate(
         **inputs,
-        max_new_tokens=256,
+        max_new_tokens=512,
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
         pad_token_id=tokenizer.eos_token_id
     )
-    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    reply = full_output.split("<|im_start|>assistant\n")[-1].strip()
-    for word in reply.split():
+
+    # Decode and clean the output
+    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)
+
+    # Extract only the assistant's response
+    response = full_output[len(qwen_prompt):].split(tokenizer.eos_token)[0].strip()
+
+    # Stream the response
+    for word in response.split():
         yield word + " "
-        await asyncio.sleep(0.01)
+        await asyncio.sleep(0.05)
 
-# ✅ API route
 @app.post("/ask")
 async def ask(question: Question):
-    return StreamingResponse(generate_response_chunks(question.question), media_type="text/plain")
+    return StreamingResponse(
+        generate_response_chunks(question.question),
+        media_type="text/plain"
+    )