from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import asyncio
# Set all cache directories to a writable location
cache_dir = "/tmp/hf_home"
os.environ["HF_HOME"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
# Create cache directory with proper permissions
os.makedirs(cache_dir, exist_ok=True)
os.chmod(cache_dir, 0o777) # Make writable by all
# Load model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=cache_dir)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, cache_dir=cache_dir)
except Exception as e:
    print(f"Error loading model: {e}")
    raise
# Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Initialize FastAPI
app = FastAPI()
# Enable CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Input data model
class Question(BaseModel):
    question: str
# Instructional system prompt
SYSTEM_PROMPT = "You are Orion, an intelligent AI assistant created by Abdullah Ali, a 13-year-old from Lahore. Respond kindly and wisely."
# Streaming response generator
async def generate_response_chunks(prompt: str):
    # Build the Qwen chat-format prompt with system and user turns
    qwen_prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
    # Note: model.generate runs synchronously, so the whole reply is produced
    # before streaming starts; the generator below only chunks the finished text.
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens. With skip_special_tokens=True the
    # <|im_start|> markers are stripped, so splitting the full decode on them
    # would return the entire prompt plus reply instead of just the reply.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    # Stream the reply word by word with a small delay between chunks
    for word in reply.split():
        yield word + " "
        await asyncio.sleep(0.01)
# API route
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(generate_response_chunks(question.question), media_type="text/plain")
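
# Minimal local-run sketch (assumption: this file is app.py and uvicorn is
# installed; on a hosted Space the server is normally started externally,
# e.g. `uvicorn app:app --host 0.0.0.0 --port 7860`).
#
# Example request against the streaming endpoint (hypothetical host/port);
# curl's -N flag disables buffering so chunks appear as they arrive:
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello, who are you?"}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)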