from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Define model ID.
# NOTE: the original Space pointed at "Qwen/Qwen2.5-VL-7B-Instruct", but that
# vision-language checkpoint cannot be loaded with AutoModelForCausalLM, and the
# API below is text-only, so the text-only sibling is used here.
model_id = "Qwen/Qwen2.5-7B-Instruct"
# Download model and tokenizer locally
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # use device_map="cpu" to force CPU
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # GPU: float16, CPU: float32
    trust_remote_code=True,
)
model.eval()
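# If the vision-language checkpoint is actually wanted, it needs the VL-specific
# class rather than AutoModelForCausalLM. A sketch, assuming a recent transformers
# release (>= 4.49) that ships Qwen2_5_VLForConditionalGeneration:
#
#   from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
#   model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#       "Qwen/Qwen2.5-VL-7B-Instruct", device_map="auto", torch_dtype="auto"
#   )
#   processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")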
# Initialize FastAPI
app = FastAPI()
# CORS settings
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request model
class Question(BaseModel):
    question: str
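# Example request body for the /ask endpoint (illustrative):
#   {"question": "What is FastAPI?"}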
# Generate response chunks
async def generate_response_chunks(prompt: str):
    try:
        # Define system prompt
        system_prompt = (
            "You are Orion, an AI assistant created by Abdullah Ali, "
            "a very intelligent 13-year-old who lives in Lahore."
        )
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\nAssistant:"
        # Tokenize the input and move tensors to the model's device
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
        # Generate output (no gradients needed at inference time)
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
            )
        # Decode only the newly generated tokens, skipping the prompt
        output_text = tokenizer.decode(
            output_ids[0][inputs.input_ids.shape[-1]:],
            skip_special_tokens=True,
        )
        # Stream the decoded text to the client character by character
        for letter in output_text:
            yield letter
    except Exception as e:
        yield f"Error occurred: {e}"
# API Endpoint
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
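# To run locally (assuming this file is saved as app.py):
#   uvicorn app:app --host 0.0.0.0 --port 7860
# (7860 is the port Hugging Face Spaces expects; any free port works locally.)
#
# Example request; curl's -N flag prints the streamed response as it arrives:
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Who created you?"}'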