Spaces: Runtime error
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
import torch
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration

# Define model ID
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Download model and tokenizer locally.
# Qwen2.5-VL is a vision-language model, so it cannot be loaded through
# AutoModelForCausalLM (a likely cause of the Space's runtime error); the
# dedicated Qwen2_5_VLForConditionalGeneration class is used instead, which
# requires a recent transformers release.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",  # Use device_map="cpu" to force CPU
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # GPU: float16, CPU: float32
    trust_remote_code=True,
)
model.eval()

# Initialize FastAPI
app = FastAPI()

# CORS settings
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request model
class Question(BaseModel):
    question: str

# Generate response chunks
async def generate_response_chunks(prompt: str):
    try:
        # Define system prompt
        system_prompt = (
            "You are Orion, an AI assistant created by Abdullah Ali, "
            "who is very intelligent, is 13 years old, and lives in Lahore."
        )
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\nAssistant:"

        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

        # Generate output
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

        # Decode only the newly generated tokens (everything after the prompt)
        output_text = tokenizer.decode(
            output_ids[0][inputs.input_ids.shape[-1]:],
            skip_special_tokens=True,
        )

        # Stream output letter-by-letter
        for letter in output_text:
            yield letter
    except Exception as e:
        yield f"Error occurred: {e}"

# API Endpoint
# NOTE: the original code never registered a route for ask(); the POST path
# "/ask" below is assumed from the function name.
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
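
A quick way to exercise the endpoint from a separate client script, assuming the app is served locally with uvicorn on port 8000 and the route is mounted at POST /ask as added above (both are assumptions, not details from the original post):

import requests

# Read the plain-text response incrementally as the server yields it
with requests.post(
    "http://localhost:8000/ask",
    json={"question": "Who created you?"},
    stream=True,
) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
print()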