from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Define model ID.
# Note: the original file pointed at "Qwen/Qwen2.5-VL-7B-Instruct", a vision-language
# checkpoint that AutoModelForCausalLM cannot load (it requires the dedicated
# Qwen2.5-VL model class). Since this service only handles text, the text-only
# instruct variant is used here instead.
model_id = "Qwen/Qwen2.5-7B-Instruct"
# Download model and tokenizer locally
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # use device_map="cpu" to force CPU
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # GPU: float16, CPU: float32
    trust_remote_code=True,
)
model.eval()
# Initialize FastAPI
app = FastAPI()
# CORS settings
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request model
class Question(BaseModel):
    question: str
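# Example request body for POST /ask (illustrative):
#   {"question": "What is the capital of Pakistan?"}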
# Generate response chunks
async def generate_response_chunks(prompt: str):
    try:
        # System prompt that sets the assistant's persona
        system_prompt = (
            "You are Orion, an AI assistant created by Abdullah Ali, "
            "a very intelligent 13-year-old who lives in Lahore."
        )
        # Build the prompt with the model's chat template, the recommended
        # way to prompt instruct-tuned Qwen models
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        full_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
        # Generate output (note: this blocks until the full completion is ready)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )
        # Decode only the newly generated tokens, skipping the prompt
        output_text = tokenizer.decode(
            output_ids[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True
        )
        # Stream the finished text to the client character by character
        for letter in output_text:
            yield letter
    except Exception as e:
        yield f"Error occurred: {e}"
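# Optional sketch (not in the original file): true token-by-token streaming.
# generate_response_chunks above only pseudo-streams, since model.generate
# returns the complete answer before any characters are yielded. Transformers'
# TextIteratorStreamer lets model.generate run in a background thread and
# yields decoded text as tokens arrive; StreamingResponse also accepts plain
# (sync) generators like this one.
def generate_response_stream(prompt: str):
    from threading import Thread
    from transformers import TextIteratorStreamer

    messages = [
        {"role": "system", "content": "You are Orion, a helpful AI assistant."},
        {"role": "user", "content": prompt},
    ]
    full_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation in a worker thread; the streamer is consumed here as
    # tokens are produced
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=512),
    )
    thread.start()
    for chunk in streamer:
        yield chunk
    thread.join()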
# API Endpoint
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
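# Local entry point (assumption: the original Space relies on an external
# launcher; host/port below are illustrative, 7860 being the Hugging Face
# Spaces default). Example call once the server is running:
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello"}'
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)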