from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define model ID
# Note: Qwen2.5-VL is a vision-language checkpoint; AutoModelForCausalLM may refuse to load it.
# A text-only checkpoint such as "Qwen/Qwen2.5-7B-Instruct" matches this loading code directly
# (see the alternative loading sketch below for the dedicated VL class).
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Download model and tokenizer locally
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # Use "cpu" if you want to force CPU: device_map="cpu"
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # GPU: float16, CPU: float32
    trust_remote_code=True
)
model.eval()
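
# Optional: alternative loading path for the vision-language checkpoint (a minimal sketch,
# not used below; assumes a transformers release that ships native Qwen2.5-VL support):
#
#   from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
#   processor = AutoProcessor.from_pretrained(model_id)
#   model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#       model_id,
#       device_map="auto",
#       torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
#   )
#   model.eval()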

# Initialize FastAPI
app = FastAPI()

# CORS settings
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # allow all origins; restrict this for production deployments
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request model
class Question(BaseModel):
    question: str

# Generate response chunks
async def generate_response_chunks(prompt: str):
    try:
        # Define system prompt
        system_prompt = (
            "You are Orion, an AI assistant created by Abdullah Ali, who is very intelligent, "
            "13 years old, and lives in Lahore."
        )
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\nAssistant:"

        # Tokenize input (returns input_ids and attention_mask)
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

        # Generate output
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1
        )

        # Decode only the newly generated tokens (skip the prompt tokens)
        output_text = tokenizer.decode(output_ids[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)

        # Stream the finished text character by character.
        # Note: generation completes first, so this is pseudo-streaming; the client
        # only starts receiving output once the full response has been generated.
        for char in output_text:
            yield char
    except Exception as e:
        yield f"Error occurred: {e}"

# API Endpoint
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain"
    )
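
# To try the endpoint locally (assuming this file is saved as app.py):
#   uvicorn app:app --host 0.0.0.0 --port 8000
# and then, in another terminal:
#   curl -N -X POST http://localhost:8000/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Who created you?"}'
# The -N flag disables curl's output buffering so streamed chunks show up as they arrive.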