import os
import asyncio

# Point every Hugging Face cache at a writable directory. These variables
# must be set before transformers is imported, since some of them are read
# at import time.
cache_dir = "/tmp/hf_home"
os.environ["HF_HOME"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir

# Create the cache directory with open permissions (Spaces containers may
# run the app as a non-root user)
os.makedirs(cache_dir, exist_ok=True)
os.chmod(cache_dir, 0o777)

import torch
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the tokenizer and model, caching weights under cache_dir
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=cache_dir,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=cache_dir,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)

# Move the model to the GPU if one is available and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
# Initialize FastAPI
app = FastAPI()

# Enable CORS so the API can be called from any origin
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request body schema
class Question(BaseModel):
    question: str

# System prompt prepended to every conversation
SYSTEM_PROMPT = (
    "You are Orion, an intelligent AI assistant created by Abdullah Ali, "
    "a 13-year-old from Lahore. Respond kindly and wisely."
)
async def generate_response_chunks(prompt: str):
    # Build the conversation in the format the chat template expects
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]

    # Apply the model's chat template; for Qwen2.5 this produces ChatML-style
    # text ending in an open "<|im_start|>assistant" turn for the model to fill
    qwen_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize, then run the blocking generate call in a worker thread so it
    # does not stall the event loop
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
    outputs = await asyncio.to_thread(
        model.generate,
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens; slicing the decoded string by
    # the prompt's character length is fragile, since decoding does not
    # always round-trip the prompt exactly
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Stream the reply word by word; the full text is already generated, so
    # this simulates token-level streaming rather than producing it live
    for word in response.split():
        yield word + " "
        await asyncio.sleep(0.05)
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
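
# A minimal client sketch (not part of the app): it assumes the server is
# reachable at http://localhost:7860, the default port for Hugging Face
# Spaces; adjust the URL for your deployment. Requires the `requests`
# package.
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/ask",
#       json={"question": "Who are you?"},
#       stream=True,
#   ) as resp:
#       resp.raise_for_status()
#       for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="", flush=True)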