# finaltry/main.py
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
import torch
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration
# Define model ID (Qwen2.5-VL is a vision-language model, used here text-only)
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Download model and tokenizer locally. Qwen2.5-VL is not registered with
# AutoModelForCausalLM, so it is loaded through its dedicated class; text-only
# generation still works.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",  # use device_map="cpu" to force CPU
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # GPU: float16, CPU: float32
    trust_remote_code=True,
)
model.eval()
# Initialize FastAPI
app = FastAPI()
# CORS settings (wide open for development; restrict allow_origins in production)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request model
class Question(BaseModel):
question: str
# Generate response chunks
async def generate_response_chunks(prompt: str):
    try:
        # System prompt defining the assistant's persona
        system_prompt = (
            "You are Orion, an AI assistant created by Abdullah Ali, "
            "a very intelligent 13-year-old from Lahore."
        )
        # Build the prompt with the tokenizer's chat template so it matches
        # the format the model was trained on
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        full_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # Tokenize the prompt and move it to the model's device
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
        # Generate the full response (sampling for variety)
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
            )
        # Decode only the newly generated tokens, skipping the prompt
        output_text = tokenizer.decode(
            output_ids[0][inputs.input_ids.shape[-1]:],
            skip_special_tokens=True,
        )
        # Yield the finished text character by character. Note the response
        # is generated in full first; see the streamer-based sketch below
        # for true incremental streaming.
        for ch in output_text:
            yield ch
    except Exception as e:
        yield f"Error occurred: {e}"
# API endpoint: streams the model's answer back as plain text
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain",
    )
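# To serve the app (port 7860 is the Hugging Face Spaces convention; adjust
# as needed):
#   uvicorn main:app --host 0.0.0.0 --port 7860
#
# Example request, streaming the plain-text answer as it arrives:
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Who created you?"}'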